# Deploy Recipes to Google Dataflow #12
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Manually triggered workflow that submits a recipe pipeline to Google Dataflow.
name: Deploy Recipes to Google Dataflow

env:
  # Job name handed to the pipeline: <recipe_id>-<run_id>-<run_attempt>, so every
  # run attempt produces a distinct name.
  # NOTE(review): Dataflow presumably restricts job names to lowercase letters,
  # digits and hyphens -- confirm that recipe ids always satisfy this.
  JOB_NAME: ${{ github.event.inputs.recipe_id }}-${{ github.run_id }}-${{ github.run_attempt }}

on:
  workflow_dispatch:
    inputs:
      # In the visible part of this workflow the input is only used to build
      # JOB_NAME; it is not forwarded to the recipe script.
      recipe_id:
        description: 'The id of a single recipe to submit to Dataflow'
        required: true
        default: 'all'
jobs:
  # Single job: authenticate to Google Cloud, set up Python, then launch the
  # Beam pipeline defined in feedstock/recipe.py with the Dataflow runner.
  deploy-recipes:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      # Credentials come from the LEAP_BAKERY_SERVICE_ACCOUNT repository secret.
      - name: "Authenticate to Google Cloud"
        id: "auth"
        uses: "google-github-actions/auth@v2"
        with:
          credentials_json: "${{ secrets.LEAP_BAKERY_SERVICE_ACCOUNT }}"

      - uses: actions/setup-python@v5
        with:
          # Quoted so the version stays a string instead of being read as a float.
          python-version: '3.12'

      # NOTE(review): the Beam-related packages are commented out below, so only
      # `google-cloud` is installed here -- confirm that feedstock/recipe.py can
      # import everything it needs on the runner.
      - name: "Install deps"
        run: |
          python -m pip install --upgrade pip
          pip install google-cloud
          # pip install apache-beam[gcp] xarray_beam xarray kerchunk fastparquet pyarrow gcsfs google-cloud

      # - name: Get service account for beam
      #   id: get-account
      #   run: |
      #     echo "SA_EMAIL=$(gcloud config get-value account)" >> $GITHUB_OUTPUT
      #     echo $SA_EMAIL
      #     echo $GITHUB_OUTPUT

      - name: "Deploy Beam pipeline"
        run: |
          # Every line except the last must end in a backslash; a missing one
          # truncates the command and runs the remaining flags as a separate
          # (failing) shell command.
          # NOTE(review): the --service_account_email value is a redacted
          # placeholder; replace it with the real service-account address.
          # NOTE(review): --machine-type is hyphenated while the other flags use
          # underscores -- confirm that recipe.py accepts this spelling.
          python feedstock/recipe.py \
            --runner DataflowRunner \
            --requirements_file requirements.txt \
            --project leap-pangeo \
            --job_name "$JOB_NAME" \
            --region us-central1 \
            --machine-type n1-highmem-2 \
            --max_num_workers 10 \
            --temp_location gs://leap-scratch/norlandrhagen/dataflow_temp/temp/ \
            --staging_location gs://leap-scratch/norlandrhagen/dataflow_temp/staging/ \
            --input gs://leap-scratch/norlandrhagen/dataflow_temp/input/ \
            --output gs://leap-scratch/norlandrhagen/dataflow_temp/output/ \
            --service_account_email [email protected]

      # Disabled step: polls until no active Dataflow job matches JOB_NAME.
      # - name: Wait for Dataflow jobs to finish
      #   # An attempt to make this reusable failed because env.JOB_NAME was not
      #   # accepted as an input, so the loop is inlined here instead.
      #   run: |
      #     jobname="${{ env.JOB_NAME }}"
      #     while true; do
      #       count=$(gcloud dataflow jobs list --status=active --filter="name:${jobname}" --format="value(id)" | wc -l)
      #       echo "Active Dataflow jobs: $count"
      #       if [ "$count" -eq "0" ]; then
      #         echo "No active Dataflow jobs found."
      #         break
      #       fi
      #       echo "Waiting for Dataflow jobs to finish..."
      #       sleep 20
      #     done