From dd9d0110becd9fb41319d719776a402c4d36cb17 Mon Sep 17 00:00:00 2001 From: Dave Morris Date: Tue, 27 Feb 2024 12:31:01 +0000 Subject: [PATCH] Testing connection issues on Arcus --- notes/zrq/20240224-01-arcus-tests.txt | 179 ++++++++++++++++++++++++ notes/zrq/20240225-01-arcus-tests.txt | 87 ++++++++++++ notes/zrq/20240225-02-arcus-tests.txt | 186 +++++++++++++++++++++++++ notes/zrq/20240227-01-arcus-tests.txt | 193 ++++++++++++++++++++++++++ 4 files changed, 645 insertions(+) create mode 100644 notes/zrq/20240224-01-arcus-tests.txt create mode 100644 notes/zrq/20240225-01-arcus-tests.txt create mode 100644 notes/zrq/20240225-02-arcus-tests.txt create mode 100644 notes/zrq/20240227-01-arcus-tests.txt diff --git a/notes/zrq/20240224-01-arcus-tests.txt b/notes/zrq/20240224-01-arcus-tests.txt new file mode 100644 index 00000000..2300941a --- /dev/null +++ b/notes/zrq/20240224-01-arcus-tests.txt @@ -0,0 +1,179 @@ +# +# +# +# Copyright (c) 2024, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Working with Paul Browne to diagnose issues with the Arcus cloud. + https://github.com/wfau/gaia-dmp/issues/1308 + https://ucam-rcs.atlassian.net/servicedesk/customer/portal/4/HPCSSUP-67058 + + Result: + + Work in progress ... 
+ +# ----------------------------------------------------- +# Create a new branch for our test deployments. +#[user@desktop] + + branchname=investigations + + source "${HOME:?}/aglais.env" + pushd "${AGLAIS_CODE}" + + newbranch=$(date '+%Y%m%d')-zrq-${branchname:?} + + git checkout master + + git checkout -b "${newbranch:?}" + + git push --set-upstream 'origin' "$(git branch --show-current)" + + popd + + +# ----------------------------------------------------- +# Repair the DNS record for the red deployment. +#[user@desktop] + + source "${HOME:?}/aglais.env" + ansi-client 'red' + + source /deployments/admin/bin/create-user-tools.sh + ducktoken=$(getsecret 'devops.duckdns.token') + + ipaddress=128.232.226.223 + curl "https://www.duckdns.org/update/${cloudname:?}/${ducktoken:?}/${ipaddress:?}" + + +# ----------------------------------------------------- +# Transfer Paul's ssh key onto the three key machines. +#[user@desktop] + + sshkey="ssh-rsa AAAA....Irhz" + + echo "sshkey [${sshkey}]" + + echo "${sshkey}" > /tmp/pfb29.cam.ac.uk.pub + + cat /tmp/pfb29.cam.ac.uk.pub + + scp /tmp/pfb29.cam.ac.uk.pub \ + fedora@data.gaia-dmp.uk:.ssh/pfb29.cam.ac.uk.pub + + scp /tmp/pfb29.cam.ac.uk.pub \ + fedora@red.gaia-dmp.uk:.ssh/pfb29.cam.ac.uk.pub + + scp /tmp/pfb29.cam.ac.uk.pub \ + fedora@green.gaia-dmp.uk:.ssh/pfb29.cam.ac.uk.pub + + scp /tmp/pfb29.cam.ac.uk.pub \ + fedora@blue.gaia-dmp.uk:.ssh/pfb29.cam.ac.uk.pub + + + ssh fedora@data.gaia-dmp.uk + ssh fedora@red.gaia-dmp.uk + ssh fedora@green.gaia-dmp.uk + ssh fedora@blue.gaia-dmp.uk + + + cd .ssh + cp authorized_keys authorized_keys.old + + cat pfb29.cam.ac.uk.pub >> authorized_keys + + cat authorized_keys + + + ssh fedora@red.gaia-dmp.uk + ssh fedora@green.gaia-dmp.uk + ssh fedora@blue.gaia-dmp.uk + + + + ssh data.gaia-dmp.uk "date ; hostname" + + curl --head 'https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_e216e6b502134b6185380be6ccd0bf09/archive/zeppelin-0.10.1-gaia-dmp-0.1.tar.gz' + + +# 
----------------------------------------------------- + + # + # Test things on sunday 25th + # + + ssh desktop + [user@desktop] + + ssh fedora@red.gaia-dmp.uk + + [fedora@iris-gaia-red-20240223-zeppelin ~]$ + + ssh data.gaia-dmp.uk "date ; hostname" + + Sun 25 Feb 2024 10:35:45 PM UTC + iris-gaia-data-20220411-gitstore + + curl --head 'https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_e216e6b502134b6185380be6ccd0bf09/archive/zeppelin-0.10.1-gaia-dmp-0.1.tar.gz' + + HTTP/1.1 200 OK + Content-Length: 1716996866 + Accept-Ranges: bytes + .... + + + ssh fedora@green.gaia-dmp.uk + + [fedora@iris-gaia-green-20231027-zeppelin ~]$ + + ssh data.gaia-dmp.uk "date ; hostname" + + Sun 25 Feb 22:38:00 UTC 2024 + iris-gaia-data-20220411-gitstore + + curl --head 'https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_e216e6b502134b6185380be6ccd0bf09/archive/zeppelin-0.10.1-gaia-dmp-0.1.tar.gz' + + HTTP/1.1 200 OK + Content-Length: 1716996866 + Accept-Ranges: bytes + .... + + + ssh fedora@blue.gaia-dmp.uk + + blue is broken + one vm from 2 days ago stuck in 'deleting' + + why did blue work yesterday ? + and why does it fail today ? + + + + + + + diff --git a/notes/zrq/20240225-01-arcus-tests.txt b/notes/zrq/20240225-01-arcus-tests.txt new file mode 100644 index 00000000..7f453b2a --- /dev/null +++ b/notes/zrq/20240225-01-arcus-tests.txt @@ -0,0 +1,87 @@ +# +# +# +# Copyright (c) 2024, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Test to see if the platform is working today. + + Result: + + Work in progress ... + + +# ----------------------------------------------------- +# From previous notes [notes/zrq/20240213-01-bash-dash.txt] +# Clean deploy and import our test users. +#[user@desktop] + + source "${HOME:?}/aglais.env" + ansi-client 'blue' + + source /deployments/hadoop-yarn/bin/deploy.sh + + > aglais: + > status: + > deployment: + > type: hadoop-yarn + > conf: zeppelin-54.86-spark-6.26.43 + > name: iris-gaia-blue-20240225 + > date: 20240225T225235 + > hostname: zeppelin.gaia-dmp.uk + > spec: + > openstack: + > cloud: + > base: arcus + > name: iris-gaia-blue + + + source /deployments/admin/bin/create-user-tools.sh + import-test-users + + > .... + > .... + + > "msg": " + > Error mounting /user/Thozzt: + > 2024-02-25T23:42:39.818+0000 7f1c266afec0 -1 + > auth: error parsing file /etc/ceph/ceph.client.iris-gaia-blue-user-Thozzt-rw.keyring: + > error setting modifier for [client.iris-gaia-blue-user-Thozzt-rw] type=key val=null: + > Malformed input [buffer:3] + > 2024-02-25T23:42:39.818+0000 7f1c266afec0 -1 auth: + > failed to load /etc/ceph/ceph.client.iris-gaia-blue-user-Thozzt-rw.keyring: + > (5) Input/output error\nmount error: + > no mds server is up or the cluster is laggy + > " + + # + # Main deployment looks OK, but lots of errors with CephFS mounts. 
+ # + + + + diff --git a/notes/zrq/20240225-02-arcus-tests.txt b/notes/zrq/20240225-02-arcus-tests.txt new file mode 100644 index 00000000..42f41704 --- /dev/null +++ b/notes/zrq/20240225-02-arcus-tests.txt @@ -0,0 +1,186 @@ +# +# +# +# Copyright (c) 2024, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Test to see if the Kubernetes deploy works on Arcus, + now that the networking issues seen by the Ansible deploy have been fixed. + Interesting to see if Kubernetes now works. + + Result: + + Nope, still broken. + + +# ----------------------------------------------------- +# Run our local client. +#[user@desktop] + + source "${HOME:?}/aglais.env" + export PATH=${PATH}:${AGLAIS_CODE}/bin + + kube-client blue + + > .... + > .... + + +# ----------------------------------------------------- +# Delete and create everything. +#[root@ansibler] + + /deployments/openstack/bin/delete-all.sh \ + "${cloudname:?}" + + ansible-playbook \ + --inventory 'bootstrap,' \ + '/deployments/cluster-api/ansible/00-create-all.yml' + + + > .... + > .... 
+ > PLAY RECAP ************************************************************************************************************************************* + > bootstrap : ok=54 changed=43 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 + > localhost : ok=35 changed=26 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 + + +# ----------------------------------------------------- +# Check the deployment configuration. +#[root@ansibler] + + cat /opt/aglais/aglais-status.yml + + > aglais: + > deployment: + > date: 20240225 + > debug: + > started: '2024-02-25 23:53:00.480033' + > name: iris-gaia-blue-20240225 + > type: cluster-api + > kubernetes: + > cluster: + > kind: + > conf: /opt/aglais/iris-gaia-blue-20240225-kind.yml + > debug: + > created: '2024-02-25 23:57:08.436498' + > name: iris-gaia-blue-20240225-kind + > work: + > conf: /opt/aglais/iris-gaia-blue-20240225-work.yml + > debug: + > created: '2024-02-26 00:01:05.355638' + > name: iris-gaia-blue-20240225-work + > version: 1.26.7 + > openstack: + > cloud: + > name: iris-gaia-blue + > site: cambridge-arcus + > keypair: + > fingerprint: 2e:84:98:98:df:70:06:0e:4c:ed:bd:d4:d6:6b:eb:16 + > id: iris-gaia-blue-20240225-keypair + > name: iris-gaia-blue-20240225-keypair + > networks: + > bootstrap: + > network: + > id: 861ff3af-ebab-441e-b8cb-f06c4eb063e6 + > name: iris-gaia-blue-20240225-bootstrap-network + > router: + > id: 4ff6f93a-a2fc-467d-a0fe-5b1e2120001e + > name: iris-gaia-blue-20240225-bootstrap-network-router + > subnet: + > cidr: 10.10.0.0/16 + > id: 5534cf77-2348-4a76-b71c-52dfae396a53 + > name: iris-gaia-blue-20240225-bootstrap-network-subnet + > external: + > network: + > id: 57add367-d205-4030-a929-d75617a7c63e + > name: CUDN-Internet + > project: + > id: e918a13fed2648758175a15fac083569, + > name: iris-gaia-blue + > servers: + > bootstrap: + > float: + > external: 128.232.226.171 + > id: 18221aa0-b821-40a8-ac56-5ab3cc2354c9 + > internal: 10.10.1.156 + > server: + > address: + > ipv4: 10.10.1.156 + > 
flavor: + > name: gaia.vm.cclake.2vcpu + > hostname: bootstrap + > id: fa38229b-f9d3-439c-b8d0-580d9888a37c + > image: + > id: 0d32b1a9-c034-47ef-88d6-ad1a9ba0b91c + > name: gaia-dmp-fedora-cloud-38-1.6 + > name: iris-gaia-blue-20240225-bootstrap-node + > user: + > id: 5fa0c97a6dd14e01a3c7d91dad5c6b17, + > name: dmorris_gaia + + +# ----------------------------------------------------- +# Check the cluster status. +#[root@ansibler] + + ssh bootstrap -t \ + ' + source loadconfig + watch \ + clusterctl \ + --kubeconfig "${kindclusterconf:?}" \ + describe cluster \ + "${workclustername:?}" + ' + + > NAME READY SEVERITY REASON SINCE MESSAGE + > Cluster/iris-gaia-blue-20240225-work False Warning ScalingUp 4m5s Scaling up control plane to 1 replicas (actual 0) + > ├─ClusterInfrastructure - OpenStackCluster/iris-gaia-blue-20240225-work + > ├─ControlPlane - KubeadmControlPlane/iris-gaia-blue-20240225-work-control-plane False Warning ScalingUp 4m5s Scaling up control plane to 1 replicas (actual 0) + > │ └─Machine/iris-gaia-blue-20240225-work-control-plane-pbrlr False Info WaitingForBootstrapData 2m26s 1 of 2 completed + > └─Workers + > └─MachineDeployment/iris-gaia-blue-20240225-work-md-0 False Warning WaitingForAvailableMachines 4m6s Minimum availability requires 2 replicas, current 0 available + > └─3 Machines... False Info WaitingForBootstrapData 2m28s See iris-gaia-blue-20240225-work-md-0-bkzv2-6drwq, iris-gaia-blue-20240225-work-md-0-bkzv2-fjpjr, ... 
+ + > NAME READY SEVERITY REASON SINCE MESSAGE + > Cluster/iris-gaia-blue-20240225-work False Warning NodeStartupTimeout @ /iris-gaia-blue-20240225-work-control-plane-pbrlr 8m12s Node failed to report startup in 10m0s + > ├─ClusterInfrastructure - OpenStackCluster/iris-gaia-blue-20240225-work + > ├─ControlPlane - KubeadmControlPlane/iris-gaia-blue-20240225-work-control-plane False Warning NodeStartupTimeout @ /iris-gaia-blue-20240225-work-control-plane-pbrlr 8m12s Node failed to report startup in 10m0s + > │ └─Machine/iris-gaia-blue-20240225-work-control-plane-pbrlr False Warning NodeStartupTimeout 8m12s Node failed to report startup in 10m0s + > └─Workers + > └─MachineDeployment/iris-gaia-blue-20240225-work-md-0 False Warning WaitingForAvailableMachines 23m Minimum availability requires 2 replicas, current 0 available + > └─3 Machines... True 7m34s See iris-gaia-blue-20240225-work-md-0-bkzv2-d22qx, iris-gaia-blue-20240225-work-md-0-bkzv2-ldd4h, ... + > Connection to bootstrap closed. + + # + # After 10 min it starts to delete and create new nodes. + # but all of them fail to callback to healthcheck endpoints + # + # So K8s is broken on Arcus. + # + diff --git a/notes/zrq/20240227-01-arcus-tests.txt b/notes/zrq/20240227-01-arcus-tests.txt new file mode 100644 index 00000000..71ad1f78 --- /dev/null +++ b/notes/zrq/20240227-01-arcus-tests.txt @@ -0,0 +1,193 @@ +# +# +# +# Copyright (c) 2024, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Test that the connection issues on Arcus have been fixed. + https://github.com/wfau/gaia-dmp/issues/1308 + https://ucam-rcs.atlassian.net/servicedesk/customer/portal/4/HPCSSUP-67058 + + Result: + + Zeppelin VMs in each of the clouds can reach the 'gitstore' VM in the data project. + Zeppelin VMs in each of the clouds can access the tar.gz file in Swift Object Store. + Looks like the routing issue is fixed. + + TODO + Log in to the Cambridge HPC Atlassian and close the ticket. + https://github.com/wfau/gaia-dmp/issues/1308 + https://ucam-rcs.atlassian.net/servicedesk/customer/portal/4/HPCSSUP-67058 + + +# ----------------------------------------------------- +# Login to each deployment and check connectivity. +#[user@laptop] + + # + # Getting the IP addresses from Horizon UI because builds on red + # and blue haven't completed yet so the DNS record hasn't been set. 
+ # + + red 128.232.227.103 + green 128.232.227.27 + blue 128.232.226.211 + + + ssh fedora@128.232.227.103 \ + ' + date + hostname + ' + + > Tue Feb 27 12:10:28 PM UTC 2024 + > iris-gaia-red-20240227-zeppelin + + + ssh fedora@128.232.227.103 \ + ' + date + hostname + echo "----" + ssh data.gaia-dmp.uk "date ; hostname" + ' + + > Tue Feb 27 12:11:18 PM UTC 2024 + > iris-gaia-red-20240227-zeppelin + > ---- + > Tue 27 Feb 2024 12:11:20 PM UTC + > iris-gaia-data-20220411-gitstore + + + ssh fedora@128.232.227.103 \ + ' + date + hostname + echo "----" + curl --silent --head 'https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_e216e6b502134b6185380be6ccd0bf09/archive/zeppelin-0.10.1-gaia-dmp-0.1.tar.gz' + ' + + > Tue Feb 27 12:12:45 PM UTC 2024 + > iris-gaia-red-20240227-zeppelin + > ---- + > HTTP/1.1 200 OK + > Content-Length: 1716996866 + > Accept-Ranges: bytes + > Last-Modified: Mon, 20 Feb 2023 20:14:01 GMT + > .... + > .... + + + ssh fedora@green.gaia-dmp.uk \ + ' + date + hostname + ' + + > Tue 27 Feb 12:14:21 UTC 2024 + > iris-gaia-green-20231027-zeppelin + + + ssh fedora@green.gaia-dmp.uk \ + ' + date + hostname + echo "----" + ssh data.gaia-dmp.uk "date ; hostname" + ' + + > Tue 27 Feb 12:14:37 UTC 2024 + > iris-gaia-green-20231027-zeppelin + > ---- + > Tue 27 Feb 12:14:38 UTC 2024 + > iris-gaia-data-20220411-gitstore + + + ssh fedora@green.gaia-dmp.uk \ + ' + date + hostname + echo "----" + curl --silent --head 'https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_e216e6b502134b6185380be6ccd0bf09/archive/zeppelin-0.10.1-gaia-dmp-0.1.tar.gz' + ' + + > Tue 27 Feb 12:14:55 UTC 2024 + > iris-gaia-green-20231027-zeppelin + > ---- + > HTTP/1.1 200 OK + > Content-Length: 1716996866 + > Accept-Ranges: bytes + > Last-Modified: Mon, 20 Feb 2023 20:14:01 GMT + > .... + > .... 
+ + + ssh fedora@128.232.226.211 \ + ' + date + hostname + ' + + > Tue Feb 27 12:16:21 PM UTC 2024 + > iris-gaia-blue-20240227-zeppelin + + + ssh fedora@128.232.226.211 \ + ' + date + hostname + echo "----" + ssh data.gaia-dmp.uk "date ; hostname" + ' + + > Tue Feb 27 12:17:26 PM UTC 2024 + > iris-gaia-blue-20240227-zeppelin + > ---- + > Tue 27 Feb 2024 12:17:27 PM UTC + > iris-gaia-data-20220411-gitstore + + + ssh fedora@128.232.226.211 \ + ' + date + hostname + echo "----" + curl --silent --head 'https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_e216e6b502134b6185380be6ccd0bf09/archive/zeppelin-0.10.1-gaia-dmp-0.1.tar.gz' + ' + + > Tue Feb 27 12:17:47 PM UTC 2024 + > iris-gaia-blue-20240227-zeppelin + > ---- + > HTTP/1.1 200 OK + > Content-Length: 1716996866 + > Accept-Ranges: bytes + > Last-Modified: Mon, 20 Feb 2023 20:14:01 GMT + > .... + > .... + + +