Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ability to force unlock TF state and fix digest value of lock file in DynamoDB #235

Merged
merged 13 commits into from
Sep 7, 2023
32 changes: 32 additions & 0 deletions bin/fmt
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

praise: Thanks for adding this!!

Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#! /usr/bin/env bash

# DOC: Format terraform, python, and shell script files.

# Each formatter is looked up with `command -v`, the POSIX shell builtin for
# locating a command. The `-s` (silent) flag of `which` is not available in
# every `which` implementation, so relying on it can make the check fail even
# when the tool is installed.

if command -v terraform >/dev/null 2>&1; then
  echo "Formatting Terraform files"
  terraform fmt -recursive -write
else
  echo "Can not find the terraform binary. Please install Terraform first." >&2
  exit 1
fi

if command -v shfmt >/dev/null 2>&1; then
  echo "Formatting shell scripts"
  # Let shfmt walk the directory itself instead of expanding an unquoted
  # $(shfmt -f .), which would word-split any path containing whitespace.
  shfmt -bn -ci -i 2 -w -l .
else
  echo "Could not find the shfmt binary. Please install shfmt first." >&2
  exit 1
fi

if command -v yapf >/dev/null 2>&1; then
  echo "Formatting python files"
  yapf \
    --verbose \
    --style='{based_on_style: google, SPLIT_BEFORE_FIRST_ARGUMENT:true}' \
    --in-place \
    --recursive \
    .
else
  echo "Could not find yapf. Please install it first." >&2
  exit 1
fi
29 changes: 26 additions & 3 deletions cloud/aws/templates/aws_oidc/bin/aws_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,24 @@ def wait_for_ecs_service_healthy(self):
)
time.sleep(30)

def set_lock_table_digest_value(self, value):
    """
    Sets the lock file digest value in DynamoDB to the given value. This
    digest value is a checksum of the Terraform state file stored in S3.

    If something goes wrong during deployment, especially when a user has
    force-unlocked due to a previous issue and then multiple apply actions
    are happening at once, the digest value recorded for the Terraform
    state file can be incorrect. This function lets us set the digest value
    to the correct value, as given by the error message of a previous
    Terraform command, without having to go into the AWS console to
    set it manually.

    Args:
        value: The digest string to store in the "Digest" attribute.
    """
    table = f'{self.config.app_prefix}-{resources.S3_TERRAFORM_LOCK_TABLE}'
    bucket = f'{self.config.app_prefix}-{resources.S3_TERRAFORM_STATE_BUCKET}'
    # Serialize with json.dumps and shell-quote the result so that a value
    # containing quotes or whitespace cannot break the JSON document or be
    # split into extra CLI arguments when _call_cli runs shlex.split().
    item = json.dumps(
        {
            "LockID": {
                "S": f"{bucket}/tfstate/terraform.tfstate-md5"
            },
            "Digest": {
                "S": f"{value}"
            },
        },
        separators=(',', ':'))
    command = f'dynamodb put-item --table-name={table} --item={shlex.quote(item)}'
    self._call_cli(command, False)  # output = False

def _ecs_service_state(self) -> Dict:
"""
Returns the ID and rolloutState of the PRIMARY ECS service deployment. If
Expand Down Expand Up @@ -169,7 +187,12 @@ def get_url_of_secret(self, secret_name: str) -> str:
def get_url_of_s3_bucket(self, bucket_name: str) -> str:
return f"https://{self.config.aws_region}.console.aws.amazon.com/s3/buckets/{bucket_name}"

def _call_cli(self, command: str, output: bool = True) -> Dict:
    """
    Runs an `aws` CLI command in the configured region.

    Args:
        command: The CLI arguments that follow `aws`, as a single
            shell-style string (it is tokenized with shlex.split).
        output: When True (the default), `--output=json` is passed and the
            command's stdout is parsed as JSON. When False, stdout is
            ignored.

    Returns:
        The parsed JSON document when `output` is True, otherwise None.

    Raises:
        subprocess.CalledProcessError: If the CLI exits with a non-zero
            status.
    """
    base = f"aws --region={self.config.aws_region} "
    if output:
        base += "--output=json "
    out = subprocess.check_output(shlex.split(base + command))
    if not output:
        return None
    # UTF-8 is a superset of ASCII, so this accepts everything the previous
    # ASCII decode did without failing on non-ASCII characters in the
    # CLI response.
    return json.loads(out.decode("utf-8"))
157 changes: 141 additions & 16 deletions cloud/shared/bin/lib/terraform.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,119 @@
import subprocess
import os
import sys
import re
import shutil
import shlex
import inspect
from typing import Optional

from cloud.shared.bin.lib.config_loader import ConfigLoader
from cloud.shared.bin.lib.print import print
from cloud.aws.templates.aws_oidc.bin.aws_cli import AwsCli


# TODO(#2741): When using this for Azure make sure to setup backend bucket prior to calling these functions.
def force_unlock(
        config_loader: ConfigLoader,
        lock_id: str,
        terraform_template_dir: Optional[str] = None,
        initialize=True):
    '''Runs `terraform force-unlock -force` for the given lock ID.

    Args:
        config_loader: Loaded deployment config; supplies the template
            directory when one is not passed explicitly.
        lock_id: ID of the Terraform state lock to release.
        terraform_template_dir: Directory passed to terraform via -chdir.
        initialize: When True, run terraform init (without -upgrade) first.

    Raises:
        subprocess.CalledProcessError: If terraform exits non-zero.
    '''
    if not terraform_template_dir:
        terraform_template_dir = config_loader.get_template_dir()

    if initialize:
        perform_init(config_loader, terraform_template_dir, upgrade=False)

    # Quote the lock ID so an unexpected value cannot be split into extra
    # terraform arguments by shlex.split(); a plain UUID is left unchanged.
    terraform_cmd = (
        f'terraform -chdir={terraform_template_dir} '
        f'force-unlock -force {shlex.quote(lock_id)}')
    print(f" - Run {terraform_cmd}")
    subprocess.check_call(shlex.split(terraform_cmd))


def perform_init(
        config_loader: ConfigLoader,
        terraform_template_dir: Optional[str] = None,
        upgrade: bool = True):
    '''Runs terraform init and offers a fix for a stale state digest.

    Args:
        config_loader: Loaded deployment config.
        terraform_template_dir: Directory passed to terraform via -chdir;
            defaults to the config's template directory.
        upgrade: When True, pass -upgrade to terraform init.

    Raises:
        RuntimeError: If terraform init fails with an error that is not
            handled here.
        SystemExit: If the digest error was recognized but not fixed
            (non-interactive run, or the user declined the prompt).
    '''
    if not terraform_template_dir:
        terraform_template_dir = config_loader.get_template_dir()

    init_cmd = f'terraform -chdir={terraform_template_dir} init'
    if upgrade:
        init_cmd += ' -upgrade'

    if config_loader.use_local_backend:
        init_cmd += ' -reconfigure'
    else:
        init_cmd += ' -input=false'
        # backend vars file can be absent when pre-terraform setup is running
        if os.path.exists(os.path.join(terraform_template_dir,
                                       config_loader.backend_vars_filename)):
            init_cmd += f' -backend-config={config_loader.backend_vars_filename}'

    print(f" - Run {init_cmd}")
    output, exit_code = capture_stderr(init_cmd)
    # Any non-zero code is a failure; a process killed by a signal reports a
    # negative return code, which a "> 0" check would treat as success.
    if exit_code == 0:
        return

    # Determine if we're running interactively
    is_tty = sys.stdin.isatty()
    # This is AWS-specific, and should be modified when we have actual
    # Azure deployments
    if 'state data in S3 does not have the expected content' in output:
        match = re.search(r'value: ([0-9a-f]{32})', output)
        if match:
            digest = match.group(1)
            if is_tty:
                answer = input(
                    "Would you like to fix this by setting the correct digest value? Ensure that no other deployment processes are in progress. [Y/n] >"
                )
                if answer.strip().lower() in ['y', 'yes', '']:
                    aws = AwsCli(config_loader)
                    aws.set_lock_table_digest_value(digest)
                    perform_init(
                        config_loader, terraform_template_dir, upgrade)
                    return
            print(
                f"To fix the above error, rerun the command with LOCK_TABLE_DIGEST_VALUE=\"{digest}\" before it."
            )
            # Since we've handled the error and printed a message, exit immediately
            # rather than returning False and having it print a stack trace.
            sys.exit(exit_code)
    raise RuntimeError(
        "Unhandled error during terraform init. See error message above for details."
    )


# We specifically don't want to capture stdout here. When running in interactive mode,
# we'd miss the prompt to enter "yes" to continue on a terraform apply, even if we're
# printing each line as it comes in, since the line the prompt is on does not contain
# a new line character.
def capture_stderr(cmd):
    """Runs a command, capturing stderr while leaving stdout attached.

    Args:
        cmd: The command line as a single string; it is tokenized with
            shlex.split().

    Returns:
        A (stderr_text, exit_code) tuple. The captured stderr is also
        printed once the command finishes.

    Raises:
        KeyboardInterrupt: Re-raised after the child process has been asked
            to terminate and has exited, so callers never receive a partial
            result.
    """
    popen = subprocess.Popen(
        shlex.split(cmd),
        stderr=subprocess.PIPE,
        bufsize=1,
        universal_newlines=True)
    try:
        # communicate() drains the stderr pipe while waiting for the child.
        # Calling wait() first can deadlock once the child has written more
        # than the OS pipe buffer holds.
        _, stderr = popen.communicate()
    except KeyboardInterrupt:
        # Allow terraform to gracefully exit if a user Ctrl+C's out of the command
        popen.terminate()
        _, stderr = popen.communicate()
        if stderr:
            print(stderr)
        raise
    if stderr:
        print(stderr)
    return stderr, popen.returncode


# TODO(#2741): When using this for Azure make sure to setup backend bucket prior to calling these functions.
def perform_apply(
config_loader: ConfigLoader,
is_destroy=False,
terraform_template_dir: Optional[str] = None,
initialize=True):
'''Generates terraform variable files and runs terraform init and apply.'''
if not terraform_template_dir:
terraform_template_dir = config_loader.get_template_dir()
tf_vars_filename = config_loader.tfvars_filename

if initialize:
perform_init(config_loader, terraform_template_dir)

if os.path.exists(os.path.join(terraform_template_dir, tf_vars_filename)):
print(
Expand All @@ -45,16 +127,59 @@ def perform_apply(
print(" - Test. Not applying terraform.")
return True

print(" - Run terraform apply")
# Enable compact-warnings as we have a bunch of
# "value of undeclared variables" warnings as some variables used in one
# deployment (e.g. aws) but not the other.
terraform_apply_cmd = f'{terraform_cmd} apply -input=false -var-file={tf_vars_filename} -compact-warnings'
terraform_apply_cmd = f'terraform -chdir={terraform_template_dir} apply -input=false -var-file={tf_vars_filename} -compact-warnings'
if config_loader.skip_confirmations:
terraform_apply_cmd += ' -auto-approve'
if is_destroy:
terraform_apply_cmd += ' -destroy'
subprocess.check_call(shlex.split(terraform_apply_cmd))

print(f" - Run {terraform_apply_cmd}")

output, exit_code = capture_stderr(terraform_apply_cmd)
if exit_code > 0:
# Determine if we're running interactively
is_tty = sys.stdin.isatty()
if "Error acquiring the state lock" in output:
# Lock ID is a standard UUID v4 in the form 00000000-0000-0000-0000-000000000000
match = re.search(
r'ID:\s+([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})',
output)
error_text = inspect.cleandoc(
"""
The Terraform state lock can not be acquired.
This can happen if you are running a command in another process, or if another Terraform process exited prematurely.
""")
if match:
dkatzz marked this conversation as resolved.
Show resolved Hide resolved
lock_id = match.group(match.lastindex)
if is_tty:
answer = input(
"Would you like to fix this by force-unlocking the Terraform state? Ensure that no other deployment processes are in progress. [Y/n] >"
)
if answer.lower() in ['y', 'yes', '']:
force_unlock(
config_loader, lock_id, terraform_template_dir,
False) # initialize = False
return perform_apply(
config_loader, is_destroy, terraform_template_dir,
False) # initialize = False
print(
error_text +
f"\nIf you are sure there are no other Terraform processes running, this can be fixed by rerunning the command with FORCE_UNLOCK_ID=\"{lock_id}\" before it."
)
else:
print(
error_text +
"\nWe were unable to extract the lock ID from the error text. Inspect the error message above."
"\nIf you are sure there are no other Terraform processes running, this error can be fixed by rerunning the command with FORCE_UNLOCK_ID=<Lock ID> before it."
)
# Since we've handled the error and printed a message, exit immediately
# rather than returning False and having it print a stack trace.
exit(exit_code)
return False

return True


Expand Down
14 changes: 12 additions & 2 deletions cloud/shared/bin/run
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ set -o pipefail
source cloud/shared/bin/python_env_setup

# Get the arguments that we want to pass to run.py
while getopts s:c:t: flag; do
while getopts s:c:t:u:d: flag; do
case "${flag}" in
# The civiform_config file that contains the values to configure the deployment
s) source_config=${OPTARG} ;;
Expand Down Expand Up @@ -115,4 +115,14 @@ echo "env-var-docs @ git+https://github.com/civiform/civiform.git@${commit_sha}\

initialize_python_env $dependencies_file_path

cloud/shared/bin/run.py --command $command --tag $tag --config $source_config
args=("--command" "${command}" "--tag" "${tag}" "--config" "${source_config}")

if [[ -n "${FORCE_UNLOCK_ID}" ]]; then
args=("${args[@]}" "--force-unlock" "${FORCE_UNLOCK_ID}")
fi

if [[ -n "${LOCK_TABLE_DIGEST_VALUE}" ]]; then
args=("${args[@]}" "--lock-table-digest-value" "${LOCK_TABLE_DIGEST_VALUE}")
fi

cloud/shared/bin/run.py "${args[@]}"
22 changes: 22 additions & 0 deletions cloud/shared/bin/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from cloud.shared.bin.lib.print import print
from cloud.shared.bin.lib.write_tfvars import TfVarWriter
from cloud.shared.bin.lib import backend_setup
from cloud.shared.bin.lib import terraform
from cloud.aws.templates.aws_oidc.bin.aws_cli import AwsCli

_CIVIFORM_RELEASE_TAG_REGEX = re.compile(r'^v?[0-9]+\.[0-9]+\.[0-9]+$')

Expand All @@ -32,6 +34,14 @@ def main():
'--config',
default='civiform_config.sh',
help='Path to CiviForm deployment config file.')
parser.add_argument(
'--force-unlock',
help='Lock ID to force unlock before performing the Terraform apply.')
parser.add_argument(
'--lock-table-digest-value',
help=
'Digest value for the Terraform lock table to set in DynamoDB. If multiple processes are doing a deploy, or an error occurred in a previous deploy that prevented Terraform from cleaning up after itself, this value may need updating. Only works on AWS deployments.'
)

args = parser.parse_args()
if args.tag:
Expand All @@ -55,6 +65,18 @@ def main():
# Setup backend
backend_setup.setup_backend(config)

# Run the command to force unlock the TF state lock
if args.force_unlock:
print("Force unlocking the Terraform state")
terraform.force_unlock(config, args.force_unlock)

if args.lock_table_digest_value:
print(
f"Fixing the lock file digest value in DynamoDB, setting it to {args.lock_table_digest_value}"
)
aws = AwsCli(config)
aws.set_lock_table_digest_value(args.lock_table_digest_value)

# Write the passthrough vars to a temporary file
print("Writing TF Vars file")
terraform_tfvars_path = os.path.join(
Expand Down