diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 2b0f1cec788..99f29ee258a 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -5,7 +5,7 @@ tests/smoke_tests ├── test_*.py -> release pipeline -├── test_pre_merge.py -> pre-merge pipeline +├── test_quick_tests_core.py -> run quick tests on PR before merging run `PYTHONPATH=$(pwd)/tests:$PYTHONPATH python .buildkite/generate_pipeline.py` to generate the pipeline for testing. The CI will run this script as a pre-step, @@ -208,8 +208,8 @@ def _convert_release(test_files: List[str]): extra_env={cloud: '1' for cloud in CLOUD_QUEUE_MAP}) -def _convert_pre_merge(test_files: List[str]): - yaml_file_path = '.buildkite/pipeline_smoke_tests_pre_merge.yaml' +def _convert_quick_tests_core(test_files: List[str]): + yaml_file_path = '.buildkite/pipeline_smoke_tests_quick_tests_core.yaml' output_file_pipelines = [] for test_file in test_files: print(f'Converting {test_file} to {yaml_file_path}') @@ -234,18 +234,18 @@ def _convert_pre_merge(test_files: List[str]): def main(): test_files = os.listdir('tests/smoke_tests') release_files = [] - pre_merge_files = [] + quick_tests_core_files = [] for test_file in test_files: if not test_file.startswith('test_'): continue test_file_path = os.path.join('tests/smoke_tests', test_file) - if "test_pre_merge" in test_file: - pre_merge_files.append(test_file_path) + if "test_quick_tests_core" in test_file: + quick_tests_core_files.append(test_file_path) else: release_files.append(test_file_path) _convert_release(release_files) - _convert_pre_merge(pre_merge_files) + _convert_quick_tests_core(quick_tests_core_files) if __name__ == '__main__': diff --git a/README.md b/README.md index f29b57be9ca..1ed99325df5 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@

- + Documentation @@ -43,7 +43,7 @@

Archived - [Jul 2024] [**Finetune**](./llm/llama-3_1-finetuning/) and [**serve**](./llm/llama-3_1/) **Llama 3.1** on your infra -- [Apr 2024] Serve and finetune [**Llama 3**](https://skypilot.readthedocs.io/en/latest/gallery/llms/llama-3.html) on any cloud or Kubernetes: [**example**](./llm/llama-3/) +- [Apr 2024] Serve and finetune [**Llama 3**](https://docs.skypilot.co/en/latest/gallery/llms/llama-3.html) on any cloud or Kubernetes: [**example**](./llm/llama-3/) - [Mar 2024] Serve and deploy [**Databricks DBRX**](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) on your infra: [**example**](./llm/dbrx/) - [Feb 2024] Speed up your LLM deployments with [**SGLang**](https://github.com/sgl-project/sglang) for 5x throughput on SkyServe: [**example**](./llm/sglang/) - [Dec 2023] Using [**LoRAX**](https://github.com/predibase/lorax) to serve 1000s of finetuned LLMs on a single instance in the cloud: [**example**](./llm/lorax/) @@ -60,17 +60,17 @@ SkyPilot is a framework for running AI and batch workloads on any infra, offering unified execution, high cost savings, and high GPU availability. SkyPilot **abstracts away infra burdens**: -- Launch [dev clusters](https://skypilot.readthedocs.io/en/latest/examples/interactive-development.html), [jobs](https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html), and [serving](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html) on any infra +- Launch [dev clusters](https://docs.skypilot.co/en/latest/examples/interactive-development.html), [jobs](https://docs.skypilot.co/en/latest/examples/managed-jobs.html), and [serving](https://docs.skypilot.co/en/latest/serving/sky-serve.html) on any infra - Easy job management: queue, run, and auto-recover many jobs SkyPilot **supports multiple clusters, clouds, and hardware** ([the Sky](https://arxiv.org/abs/2205.07147)): - Bring your reserved GPUs, Kubernetes clusters, or 12+ clouds -- [Flexible provisioning](https://skypilot.readthedocs.io/en/latest/examples/auto-failover.html) of GPUs, TPUs, CPUs, with auto-retry +- [Flexible provisioning](https://docs.skypilot.co/en/latest/examples/auto-failover.html) of GPUs, TPUs, CPUs, with auto-retry SkyPilot **cuts your cloud costs & maximizes GPU availability**: -* [Autostop](https://skypilot.readthedocs.io/en/latest/reference/auto-stop.html): automatic cleanup of idle resources -* [Managed Spot](https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html): 3-6x cost savings using spot instances, with preemption auto-recovery -* [Optimizer](https://skypilot.readthedocs.io/en/latest/examples/auto-failover.html): 2x cost savings by auto-picking the cheapest & most available infra +* [Autostop](https://docs.skypilot.co/en/latest/reference/auto-stop.html): automatic cleanup of idle resources +* [Managed Spot](https://docs.skypilot.co/en/latest/examples/managed-jobs.html): 3-6x cost savings using spot instances, with preemption auto-recovery +* [Optimizer](https://docs.skypilot.co/en/latest/examples/auto-failover.html): 2x cost savings by auto-picking the cheapest & most available infra SkyPilot supports your existing GPU, TPU, and CPU workloads, with no code changes. 
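As a rough sketch of the workflow those bullets describe, the same launch-and-autostop flow can be expressed with SkyPilot's Python API; the cluster name, accelerator, and commands below are placeholders, and exact keyword arguments may vary across SkyPilot versions.

```python
# Rough sketch only: names and resources are placeholders, and the Python API
# surface may differ slightly between SkyPilot versions.
import sky

task = sky.Task(
    setup='pip install -r requirements.txt',  # runs once when the cluster is provisioned
    run='python train.py',                    # the job itself
    workdir='.',                              # local directory synced to the cluster
)
task.set_resources(sky.Resources(accelerators='A100:1', use_spot=True))

# Provision on the cheapest available infra and autostop after 30 idle minutes.
sky.launch(task, cluster_name='dev', idle_minutes_to_autostop=30)
```

The equivalent YAML form of a task is shown later in the README.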
@@ -79,13 +79,13 @@ Install with pip: # Choose your clouds: pip install -U "skypilot[kubernetes,aws,gcp,azure,oci,lambda,runpod,fluidstack,paperspace,cudo,ibm,scp]" ``` -To get the latest features and fixes, use the nightly build or [install from source](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html): +To get the latest features and fixes, use the nightly build or [install from source](https://docs.skypilot.co/en/latest/getting-started/installation.html): ```bash # Choose your clouds: pip install "skypilot-nightly[kubernetes,aws,gcp,azure,oci,lambda,runpod,fluidstack,paperspace,cudo,ibm,scp]" ``` -[Current supported infra](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html) (Kubernetes; AWS, GCP, Azure, OCI, Lambda Cloud, Fluidstack, RunPod, Cudo, Paperspace, Cloudflare, Samsung, IBM, VMware vSphere): +[Current supported infra](https://docs.skypilot.co/en/latest/getting-started/installation.html) (Kubernetes; AWS, GCP, Azure, OCI, Lambda Cloud, Fluidstack, RunPod, Cudo, Paperspace, Cloudflare, Samsung, IBM, VMware vSphere):

@@ -95,16 +95,16 @@ pip install "skypilot-nightly[kubernetes,aws,gcp,azure,oci,lambda,runpod,fluidst ## Getting Started -You can find our documentation [here](https://skypilot.readthedocs.io/en/latest/). -- [Installation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html) -- [Quickstart](https://skypilot.readthedocs.io/en/latest/getting-started/quickstart.html) -- [CLI reference](https://skypilot.readthedocs.io/en/latest/reference/cli.html) +You can find our documentation [here](https://docs.skypilot.co/). +- [Installation](https://docs.skypilot.co/en/latest/getting-started/installation.html) +- [Quickstart](https://docs.skypilot.co/en/latest/getting-started/quickstart.html) +- [CLI reference](https://docs.skypilot.co/en/latest/reference/cli.html) ## SkyPilot in 1 Minute A SkyPilot task specifies: resource requirements, data to be synced, setup commands, and the task commands. -Once written in this [**unified interface**](https://skypilot.readthedocs.io/en/latest/reference/yaml-spec.html) (YAML or Python API), the task can be launched on any available cloud. This avoids vendor lock-in, and allows easily moving jobs to a different provider. +Once written in this [**unified interface**](https://docs.skypilot.co/en/latest/reference/yaml-spec.html) (YAML or Python API), the task can be launched on any available cloud. This avoids vendor lock-in, and allows easily moving jobs to a different provider. Paste the following into a file `my_task.yaml`: @@ -135,7 +135,7 @@ Prepare the workdir by cloning: git clone https://github.com/pytorch/examples.git ~/torch_examples ``` -Launch with `sky launch` (note: [access to GPU instances](https://skypilot.readthedocs.io/en/latest/cloud-setup/quota.html) is needed for this example): +Launch with `sky launch` (note: [access to GPU instances](https://docs.skypilot.co/en/latest/cloud-setup/quota.html) is needed for this example): ```bash sky launch my_task.yaml ``` @@ -152,10 +152,10 @@ SkyPilot then performs the heavy-lifting for you, including:

-Refer to [Quickstart](https://skypilot.readthedocs.io/en/latest/getting-started/quickstart.html) to get started with SkyPilot. +Refer to [Quickstart](https://docs.skypilot.co/en/latest/getting-started/quickstart.html) to get started with SkyPilot. ## More Information -To learn more, see [Concept: Sky Computing](https://docs.skypilot.co/en/latest/sky-computing.html), [SkyPilot docs](https://skypilot.readthedocs.io/en/latest/), and [SkyPilot blog](https://blog.skypilot.co/). +To learn more, see [Concept: Sky Computing](https://docs.skypilot.co/en/latest/sky-computing.html), [SkyPilot docs](https://docs.skypilot.co/en/latest/), and [SkyPilot blog](https://blog.skypilot.co/). Runnable examples: diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 7627218e451..9bc7052771f 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -10,6 +10,7 @@ myst-parser==2.0.0 sphinx-autodoc-typehints==1.25.2 sphinx-book-theme==1.1.0 sphinx-togglebutton==0.3.2 +sphinx-notfound-page==1.0.4 sphinxcontrib-applehelp==1.0.7 sphinxcontrib-devhelp==1.0.5 sphinxcontrib-googleanalytics==0.4 diff --git a/docs/source/_static/SkyPilot_wide_dark.svg b/docs/source/_static/SkyPilot_wide_dark.svg index 6be00d9e591..cb2f742ab98 100644 --- a/docs/source/_static/SkyPilot_wide_dark.svg +++ b/docs/source/_static/SkyPilot_wide_dark.svg @@ -1,64 +1,54 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/_static/SkyPilot_wide_light.svg b/docs/source/_static/SkyPilot_wide_light.svg index 0b2eaae8538..71945c0f927 100644 --- a/docs/source/_static/SkyPilot_wide_light.svg +++ b/docs/source/_static/SkyPilot_wide_light.svg @@ -1,64 +1,55 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/_static/custom.css b/docs/source/_static/custom.css index d5bbdd6cb51..aae9defea90 100644 --- a/docs/source/_static/custom.css +++ b/docs/source/_static/custom.css @@ -27,6 +27,7 @@ html[data-theme="light"] { --pst-color-primary: #176de8; --pst-color-secondary: var(--pst-color-primary); --pst-color-text-base: #4c4c4d; + --logo-text-color: #0E2E65; } html[data-theme="dark"] { @@ -34,6 +35,7 @@ html[data-theme="dark"] { --pst-color-primary: #176de8; --pst-color-secondary: var(--pst-color-primary); --pst-color-text-base: #d8d8d8; + --logo-text-color: #D8D8D8; .bd-sidebar::-webkit-scrollbar { width: 6px; diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index 5ae47b7b7be..0c35532caf3 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -25,14 +25,9 @@ document.addEventListener('DOMContentLoaded', function () { document.addEventListener('DOMContentLoaded', () => { // New items: const newItems = [ - { selector: '.toctree-l1 > a', text: 'Managed Jobs' }, - { selector: '.toctree-l1 > a', text: 'Pixtral (Mistral AI)' }, { selector: '.toctree-l1 > a', text: 'Many Parallel Jobs' }, - { selector: '.toctree-l1 > a', text: 'Reserved, Capacity Blocks, DWS' }, - { selector: '.toctree-l1 > a', text: 'Llama 3.2 (Meta)' }, { selector: '.toctree-l1 > a', text: 'Admin Policy Enforcement' }, { selector: '.toctree-l1 > a', text: 'Using Existing Machines' }, - { selector: '.toctree-l1 > a', text: 'Concept: Sky Computing' }, ]; newItems.forEach(({ selector, text }) => { document.querySelectorAll(selector).forEach((el) => { diff --git 
a/docs/source/_templates/navbar-skypilot-logo.html b/docs/source/_templates/navbar-skypilot-logo.html index 0323953acde..1692f1f2a5d 100644 --- a/docs/source/_templates/navbar-skypilot-logo.html +++ b/docs/source/_templates/navbar-skypilot-logo.html @@ -9,5 +9,59 @@ {#- Logo HTML and image #} diff --git a/docs/source/conf.py b/docs/source/conf.py index a8ce3270e88..3c0b62c9947 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -41,6 +41,7 @@ 'sphinxemoji.sphinxemoji', 'sphinx_design', 'myst_parser', + 'notfound.extension', ] intersphinx_mapping = { diff --git a/docs/source/examples/managed-jobs.rst b/docs/source/examples/managed-jobs.rst index 61c33b5c43e..99fa461249d 100644 --- a/docs/source/examples/managed-jobs.rst +++ b/docs/source/examples/managed-jobs.rst @@ -499,7 +499,7 @@ To achieve the above, you can specify custom configs in :code:`~/.sky/config.yam # Specify the disk_size in GB of the jobs controller. disk_size: 100 -The :code:`resources` field has the same spec as a normal SkyPilot job; see `here `__. +The :code:`resources` field has the same spec as a normal SkyPilot job; see `here `__. .. note:: These settings will not take effect if you have an existing controller (either diff --git a/docs/source/images/skypilot-wide-dark-1k.png b/docs/source/images/skypilot-wide-dark-1k.png index 057b6a0ae97..b6ed7caec6f 100644 Binary files a/docs/source/images/skypilot-wide-dark-1k.png and b/docs/source/images/skypilot-wide-dark-1k.png differ diff --git a/docs/source/images/skypilot-wide-light-1k.png b/docs/source/images/skypilot-wide-light-1k.png index 7af87ad2864..178c6553dd3 100644 Binary files a/docs/source/images/skypilot-wide-light-1k.png and b/docs/source/images/skypilot-wide-light-1k.png differ diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index 286788625bd..d5ee4d2134a 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -22,7 +22,7 @@ Available fields and semantics: # # These take effects only when a managed jobs controller does not already exist. # - # Ref: https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html#customizing-job-controller-resources + # Ref: https://docs.skypilot.co/en/latest/examples/managed-jobs.html#customizing-job-controller-resources jobs: controller: resources: # same spec as 'resources' in a task YAML @@ -478,13 +478,13 @@ Available fields and semantics: # This must be either: 'loadbalancer', 'ingress' or 'podip'. # # loadbalancer: Creates services of type `LoadBalancer` to expose ports. - # See https://skypilot.readthedocs.io/en/latest/reference/kubernetes/kubernetes-setup.html#loadbalancer-service. + # See https://docs.skypilot.co/en/latest/reference/kubernetes/kubernetes-setup.html#loadbalancer-service. # This mode is supported out of the box on most cloud managed Kubernetes # environments (e.g., GKE, EKS). # # ingress: Creates an ingress and a ClusterIP service for each port opened. # Requires an Nginx ingress controller to be configured on the Kubernetes cluster. - # Refer to https://skypilot.readthedocs.io/en/latest/reference/kubernetes/kubernetes-setup.html#nginx-ingress + # Refer to https://docs.skypilot.co/en/latest/reference/kubernetes/kubernetes-setup.html#nginx-ingress # for details on deploying the NGINX ingress controller. # # podip: Directly returns the IP address of the pod. This mode does not @@ -513,7 +513,7 @@ Available fields and semantics: # # : The name of a service account to use for all Kubernetes pods. 
# This service account must exist in the user's namespace and have all - # necessary permissions. Refer to https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/kubernetes.html + # necessary permissions. Refer to https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/kubernetes.html # for details on the roles required by the service account. # # Using SERVICE_ACCOUNT or a custom service account only affects Kubernetes @@ -581,7 +581,7 @@ Available fields and semantics: # gke: uses cloud.google.com/gke-accelerator label to identify GPUs on nodes # karpenter: uses karpenter.k8s.aws/instance-gpu-name label to identify GPUs on nodes # generic: uses skypilot.co/accelerator labels to identify GPUs on nodes - # Refer to https://skypilot.readthedocs.io/en/latest/reference/kubernetes/kubernetes-setup.html#setting-up-gpu-support + # Refer to https://docs.skypilot.co/en/latest/reference/kubernetes/kubernetes-setup.html#setting-up-gpu-support # for more details on setting up labels for GPU support. # # Default: null (no autoscaler, autodetect label format for GPU nodes) diff --git a/docs/source/reference/kubernetes/index.rst b/docs/source/reference/kubernetes/index.rst index 89a57862c88..639b5b633ed 100644 --- a/docs/source/reference/kubernetes/index.rst +++ b/docs/source/reference/kubernetes/index.rst @@ -39,7 +39,7 @@ Why use SkyPilot on Kubernetes? .. grid-item-card:: 🖼 Run popular models on Kubernetes :text-align: center - Train and serve `Llama-3 `_, `Mixtral `_, and more on your Kubernetes with ready-to-use recipes from the :ref:`AI gallery `. + Train and serve `Llama-3 `_, `Mixtral `_, and more on your Kubernetes with ready-to-use recipes from the :ref:`AI gallery `. .. tab-item:: For Infrastructure Admins diff --git a/docs/source/reference/yaml-spec.rst b/docs/source/reference/yaml-spec.rst index 455ee5909c9..0be708305c8 100644 --- a/docs/source/reference/yaml-spec.rst +++ b/docs/source/reference/yaml-spec.rst @@ -23,7 +23,7 @@ Available fields: # which `sky` is called. # # To exclude files from syncing, see - # https://skypilot.readthedocs.io/en/latest/examples/syncing-code-artifacts.html#exclude-uploading-files + # https://docs.skypilot.co/en/latest/examples/syncing-code-artifacts.html#exclude-uploading-files workdir: ~/my-task-code # Number of nodes (optional; defaults to 1) to launch including the head node. @@ -357,7 +357,7 @@ In additional to the above fields, SkyPilot also supports the following experime # # The following fields can be overridden. Please refer to docs of Advanced # Configuration for more details of those fields: - # https://skypilot.readthedocs.io/en/latest/reference/config.html + # https://docs.skypilot.co/en/latest/reference/config.html config_overrides: docker: run_options: ... diff --git a/docs/source/reservations/existing-machines.rst b/docs/source/reservations/existing-machines.rst index 10962ecd639..717043bfd25 100644 --- a/docs/source/reservations/existing-machines.rst +++ b/docs/source/reservations/existing-machines.rst @@ -42,7 +42,7 @@ Prerequisites **Local machine (typically your laptop):** * `kubectl `_ -* `SkyPilot `_ +* `SkyPilot `_ **Remote machines (your cluster, optionally with GPUs):** diff --git a/docs/source/serving/sky-serve.rst b/docs/source/serving/sky-serve.rst index c00fa427bd6..693102c0550 100644 --- a/docs/source/serving/sky-serve.rst +++ b/docs/source/serving/sky-serve.rst @@ -242,6 +242,9 @@ Under the hood, :code:`sky serve up`: #. Meanwhile, the controller provisions replica VMs which later run the services; #. 
Once any replica is ready, the requests sent to the Service Endpoint will be distributed to one of the endpoint replicas. +.. note:: + SkyServe uses least load load balancing to distribute the traffic to the replicas. It keeps track of the number of requests each replica has handled and routes the next request to the replica with the least load. + After the controller is provisioned, you'll see the following in :code:`sky serve status` output: .. image:: ../images/sky-serve-status-output-provisioning.png @@ -515,7 +518,7 @@ To achieve the above, you can specify custom configs in :code:`~/.sky/config.yam # Specify the disk_size in GB of the SkyServe controller. disk_size: 1024 -The :code:`resources` field has the same spec as a normal SkyPilot job; see `here `__. +The :code:`resources` field has the same spec as a normal SkyPilot job; see `here `__. .. note:: These settings will not take effect if you have an existing controller (either diff --git a/examples/airflow/shared_state/README.md b/examples/airflow/shared_state/README.md index 5f39471351a..917a45862a7 100644 --- a/examples/airflow/shared_state/README.md +++ b/examples/airflow/shared_state/README.md @@ -12,7 +12,7 @@ In this guide, we demonstrate how some simple SkyPilot operations, such as launc * Airflow installed on a [Kubernetes cluster](https://airflow.apache.org/docs/helm-chart/stable/index.html) or [locally](https://airflow.apache.org/docs/apache-airflow/stable/start.html) (`SequentialExecutor`) * A Kubernetes cluster to run tasks on. We'll use GKE in this example. - * You can use our guide on [setting up a Kubernetes cluster](https://skypilot.readthedocs.io/en/latest/reference/kubernetes/kubernetes-setup.html). + * You can use our guide on [setting up a Kubernetes cluster](https://docs.skypilot.co/en/latest/reference/kubernetes/kubernetes-setup.html). * A persistent volume storage class should be available that supports at least `ReadWriteOnce` access mode. GKE has this supported by default. ## Preparing the Kubernetes Cluster @@ -39,7 +39,7 @@ In this guide, we demonstrate how some simple SkyPilot operations, such as launc name: sky-airflow-sa namespace: default roleRef: - # For minimal permissions, refer to https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/kubernetes.html + # For minimal permissions, refer to https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/kubernetes.html kind: ClusterRole name: cluster-admin apiGroup: rbac.authorization.k8s.io @@ -163,7 +163,7 @@ with DAG(dag_id='sky_k8s_example', ## Tips 1. **Persistent Volume**: If you have many concurrent tasks, you may want to use a storage class that supports [`ReadWriteMany`](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes) access mode. -2. **Cloud credentials**: If you wish to run tasks on different clouds, you can configure cloud credentials in Kubernetes secrets and mount them in the Sky pod defined in the DAG. See [SkyPilot docs on setting up cloud credentials](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#cloud-account-setup) for more on how to configure credentials in the pod. +2. **Cloud credentials**: If you wish to run tasks on different clouds, you can configure cloud credentials in Kubernetes secrets and mount them in the Sky pod defined in the DAG. See [SkyPilot docs on setting up cloud credentials](https://docs.skypilot.co/en/latest/getting-started/installation.html#cloud-account-setup) for more on how to configure credentials in the pod. 3. 
**Logging**: All SkyPilot logs are written to container stdout, which is captured as task logs in Airflow and displayed in the UI. You can also write logs to a file and read them in subsequent tasks. 4. **XComs for shared state**: Airflow also provides [XComs](https://airflow.apache.org/docs/apache-airflow/stable/concepts/xcoms.html) for cross-task communication. [`sky_k8s_example_xcoms.py`](sky_k8s_example_xcoms.py) demonstrates how to use XComs to share state between tasks. diff --git a/examples/airflow/training_workflow/README.md b/examples/airflow/training_workflow/README.md index dad08d8d3b0..71cb10bef50 100644 --- a/examples/airflow/training_workflow/README.md +++ b/examples/airflow/training_workflow/README.md @@ -7,7 +7,7 @@ In this guide, we show how a training workflow involving data preprocessing, tra

-**💡 Tip:** SkyPilot also supports defining and running pipelines without Airflow. Check out [Jobs Pipelines](https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html#job-pipelines) for more information. +**💡 Tip:** SkyPilot also supports defining and running pipelines without Airflow. Check out [Jobs Pipelines](https://docs.skypilot.co/en/latest/examples/managed-jobs.html#job-pipelines) for more information. ## Why use SkyPilot with Airflow? In AI workflows, **the transition from development to production is hard**. @@ -24,7 +24,7 @@ production Airflow cluster. Behind the scenes, SkyPilot handles environment setu Here's how you can use SkyPilot to take your dev workflows to production in Airflow: 1. **Define and test your workflow as SkyPilot tasks**. - - Use `sky launch` and [Sky VSCode integration](https://skypilot.readthedocs.io/en/latest/examples/interactive-development.html#dev-vscode) to run, debug and iterate on your code. + - Use `sky launch` and [Sky VSCode integration](https://docs.skypilot.co/en/latest/examples/interactive-development.html#dev-vscode) to run, debug and iterate on your code. 2. **Orchestrate SkyPilot tasks in Airflow** by invoking `sky launch` on their YAMLs as a task in the Airflow DAG. - Airflow does the scheduling, logging, and monitoring, while SkyPilot handles the infra setup and task execution. @@ -34,7 +34,7 @@ Here's how you can use SkyPilot to take your dev workflows to production in Airf * Airflow installed on a [Kubernetes cluster](https://airflow.apache.org/docs/helm-chart/stable/index.html) or [locally](https://airflow.apache.org/docs/apache-airflow/stable/start.html) (`SequentialExecutor`) * A Kubernetes cluster to run tasks on. We'll use GKE in this example. * A Google cloud account with GCS access to store the data for task. - * Follow [SkyPilot instructions](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#google-cloud-platform-gcp) to set up Google Cloud credentials. + * Follow [SkyPilot instructions](https://docs.skypilot.co/en/latest/getting-started/installation.html#google-cloud-platform-gcp) to set up Google Cloud credentials. ## Preparing the Kubernetes Cluster @@ -60,7 +60,7 @@ Here's how you can use SkyPilot to take your dev workflows to production in Airf name: sky-airflow-sa namespace: default roleRef: - # For minimal permissions, refer to https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/kubernetes.html + # For minimal permissions, refer to https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/kubernetes.html kind: ClusterRole name: cluster-admin apiGroup: rbac.authorization.k8s.io @@ -103,7 +103,7 @@ The train and eval step can be run in a similar way: sky launch -c train --env DATA_BUCKET_URL=gs:// train.yaml ``` -Hint: You can use `ssh` and VSCode to [interactively develop](https://skypilot.readthedocs.io/en/latest/examples/interactive-development.html) and debug the tasks. +Hint: You can use `ssh` and VSCode to [interactively develop](https://docs.skypilot.co/en/latest/examples/interactive-development.html) and debug the tasks. Note: `eval` can be optionally run on the same cluster as `train` with `sky exec`. Refer to the `shared_state` airflow example on how to do this. 
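To make the orchestration pattern above concrete, here is a minimal sketch of what such a DAG can look like when each step simply shells out to `sky launch`; the DAG id, YAML file names, and bucket URL are placeholders rather than the repo's actual DAG files, and scheduling arguments may differ slightly across Airflow 2.x versions.

```python
# Minimal sketch (not the repo's actual DAG): each Airflow task shells out to
# `sky launch`, so SkyPilot handles provisioning and execution while Airflow
# handles ordering, retries, and logging. Names and the bucket URL are placeholders.
from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator

with DAG(dag_id='sky_training_workflow_sketch',
         start_date=datetime(2024, 1, 1),
         schedule=None,
         catchup=False) as dag:
    preprocess = BashOperator(
        task_id='data_preprocessing',
        bash_command=('sky launch -y -c preprocess --down '
                      '--env DATA_BUCKET_URL=gs://my-data-bucket data_preprocessing.yaml'),
    )
    train = BashOperator(
        task_id='train',
        bash_command=('sky launch -y -c train --down '
                      '--env DATA_BUCKET_URL=gs://my-data-bucket train.yaml'),
    )
    preprocess >> train  # run training only after preprocessing succeeds
```

Airflow does the scheduling and monitoring, while each `sky launch` call provisions resources, syncs the workdir, and runs the task on the chosen infra, as described above.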
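Relatedly, the SkyServe note added in `docs/source/serving/sky-serve.rst` further up describes least-load load balancing in one sentence; the toy sketch below only illustrates that routing idea (track how many requests each replica has handled and send the next request to the least-loaded one) and is not SkyServe's actual implementation.

```python
# Toy illustration of the least-load routing idea from the SkyServe note;
# not SkyServe's actual load balancer.
class LeastLoadBalancer:
    def __init__(self, replicas):
        # Number of requests handled per replica so far.
        self.handled = {replica: 0 for replica in replicas}

    def pick_replica(self):
        # Route the next request to the replica with the least load.
        replica = min(self.handled, key=self.handled.get)
        self.handled[replica] += 1
        return replica

lb = LeastLoadBalancer(['replica-1', 'replica-2', 'replica-3'])
print([lb.pick_replica() for _ in range(6)])  # requests spread evenly across replicas
```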
diff --git a/examples/cog/README.md b/examples/cog/README.md index b2193e2e18f..97d886e2d2c 100644 --- a/examples/cog/README.md +++ b/examples/cog/README.md @@ -17,7 +17,7 @@ curl http://$IP:5000/predictions -X POST \ ``` ## Scale up the deployment using SkyServe -We can use SkyServe (`sky serve`) to scale up the deployment to multiple instances, while enjoying load balancing, autoscaling, and other [SkyServe features](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html). +We can use SkyServe (`sky serve`) to scale up the deployment to multiple instances, while enjoying load balancing, autoscaling, and other [SkyServe features](https://docs.skypilot.co/en/latest/serving/sky-serve.html). ```console sky serve up -n cog ./sky.yaml ``` diff --git a/examples/distributed-pytorch/README.md b/examples/distributed-pytorch/README.md new file mode 100644 index 00000000000..6c2f7092269 --- /dev/null +++ b/examples/distributed-pytorch/README.md @@ -0,0 +1,81 @@ +# Distributed Training with PyTorch + +This example demonstrates how to run distributed training with PyTorch using SkyPilot. + +**The example is based on [PyTorch's official minGPT example](https://github.com/pytorch/examples/tree/main/distributed/minGPT-ddp)** + + +## Overview + +There are two ways to run distributed training with PyTorch: + +1. Using normal `torchrun` +2. Using the `rdzv` backend + +The main difference between the two for fixed-size distributed training is that the `rdzv` backend automatically handles the rank for each node, while `torchrun` requires the rank to be set manually. + +SkyPilot offers convenient built-in environment variables to help you start distributed training easily. + +### Using normal `torchrun` + + +The following command will spawn 2 nodes with 1 L4 GPU each: +``` +sky launch -c train train.yaml +``` + +In [train.yaml](./train.yaml), we use `torchrun` to launch the training and set the arguments for distributed training using [environment variables](https://docs.skypilot.co/en/latest/running-jobs/environment-variables.html#skypilot-environment-variables) provided by SkyPilot. + +```yaml +run: | + cd examples/mingpt + MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1) + torchrun \ --nnodes=$SKYPILOT_NUM_NODES \ --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \ --master_addr=$MASTER_ADDR \ --master_port=8008 \ --node_rank=${SKYPILOT_NODE_RANK} \ main.py ``` + + + +### Using `rdzv` backend + +`rdzv` is an alternative backend for distributed training: + +``` +sky launch -c train-rdzv train-rdzv.yaml +``` + +In [train-rdzv.yaml](./train-rdzv.yaml), we use `torchrun` to launch the training and set the arguments for distributed training using [environment variables](https://docs.skypilot.co/en/latest/running-jobs/environment-variables.html#skypilot-environment-variables) provided by SkyPilot. + +```yaml +run: | + cd examples/mingpt + MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1) + echo "Starting distributed training, head node: $MASTER_ADDR" + + torchrun \ + --nnodes=$SKYPILOT_NUM_NODES \ + --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \ + --rdzv_backend=c10d \ + --rdzv_endpoint=$MASTER_ADDR:29500 \ + --rdzv_id $SKYPILOT_TASK_ID \ + main.py +``` + + +## Scale up + +If you would like to scale up the training, you can simply change the resource requirements, and SkyPilot's built-in environment variables will be set automatically. + +For example, the following command will spawn 4 nodes with 4 L4 GPUs each.
+ +``` +sky launch -c train train.yaml --num-nodes 4 --gpus L4:4 --cpus 8+ +``` + +We increase the `--cpus` to 8+ as well to avoid the performance to be bottlenecked by the CPU. + diff --git a/examples/distributed-pytorch/train-rdzv.yaml b/examples/distributed-pytorch/train-rdzv.yaml new file mode 100644 index 00000000000..3bcd63dde4c --- /dev/null +++ b/examples/distributed-pytorch/train-rdzv.yaml @@ -0,0 +1,29 @@ +name: minGPT-ddp-rdzv + +resources: + cpus: 4+ + accelerators: L4 + +num_nodes: 2 + +setup: | + git clone --depth 1 https://github.com/pytorch/examples || true + cd examples + git filter-branch --prune-empty --subdirectory-filter distributed/minGPT-ddp + # SkyPilot's default image on AWS/GCP has CUDA 11.6 (Azure 11.5). + uv pip install -r requirements.txt "numpy<2" "torch==1.12.1+cu113" --extra-index-url https://download.pytorch.org/whl/cu113 + +run: | + cd examples/mingpt + export LOGLEVEL=INFO + + MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1) + echo "Starting distributed training, head node: $MASTER_ADDR" + + torchrun \ + --nnodes=$SKYPILOT_NUM_NODES \ + --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \ + --rdzv_backend=c10d \ + --rdzv_endpoint=$MASTER_ADDR:29500 \ + --rdzv_id $SKYPILOT_TASK_ID \ + main.py diff --git a/examples/distributed-pytorch/train.yaml b/examples/distributed-pytorch/train.yaml new file mode 100644 index 00000000000..b45941e1485 --- /dev/null +++ b/examples/distributed-pytorch/train.yaml @@ -0,0 +1,29 @@ +name: minGPT-ddp + +resources: + cpus: 4+ + accelerators: L4 + +num_nodes: 2 + +setup: | + git clone --depth 1 https://github.com/pytorch/examples || true + cd examples + git filter-branch --prune-empty --subdirectory-filter distributed/minGPT-ddp + # SkyPilot's default image on AWS/GCP has CUDA 11.6 (Azure 11.5). + uv pip install -r requirements.txt "numpy<2" "torch==1.12.1+cu113" --extra-index-url https://download.pytorch.org/whl/cu113 + +run: | + cd examples/mingpt + export LOGLEVEL=INFO + + MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1) + echo "Starting distributed training, head node: $MASTER_ADDR" + + torchrun \ + --nnodes=$SKYPILOT_NUM_NODES \ + --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \ + --master_addr=$MASTER_ADDR \ + --master_port=8008 \ + --node_rank=${SKYPILOT_NODE_RANK} \ + main.py diff --git a/examples/k8s_cloud_deploy/README.md b/examples/k8s_cloud_deploy/README.md index 5ba42cbe836..9b0d46249d4 100644 --- a/examples/k8s_cloud_deploy/README.md +++ b/examples/k8s_cloud_deploy/README.md @@ -56,11 +56,11 @@ NODE_NAME GPU_NAME TOTAL_GPUS FREE_GPUS ## Run AI workloads on your Kubernetes cluster with SkyPilot ### Development clusters -To launch a [GPU enabled development cluster](https://skypilot.readthedocs.io/en/latest/examples/interactive-development.html), run `sky launch -c mycluster --cloud kubernetes --gpus A10:1`. +To launch a [GPU enabled development cluster](https://docs.skypilot.co/en/latest/examples/interactive-development.html), run `sky launch -c mycluster --cloud kubernetes --gpus A10:1`. SkyPilot will setup SSH config for you. 
-* [SSH access](https://skypilot.readthedocs.io/en/latest/examples/interactive-development.html#ssh): `ssh mycluster` -* [VSCode remote development](https://skypilot.readthedocs.io/en/latest/examples/interactive-development.html#vscode): `code --remote ssh-remote+mycluster "/"` +* [SSH access](https://docs.skypilot.co/en/latest/examples/interactive-development.html#ssh): `ssh mycluster` +* [VSCode remote development](https://docs.skypilot.co/en/latest/examples/interactive-development.html#vscode): `code --remote ssh-remote+mycluster "/"` ### Jobs @@ -87,7 +87,7 @@ sky-cmd-1-2ea4-head 1/1 Running 0 8m36s sky-jobs-controller-2ea485ea-2ea4-head 1/1 Running 0 10m ``` -Refer to [SkyPilot docs](https://skypilot.readthedocs.io/) for more. +Refer to [SkyPilot docs](https://docs.skypilot.co/) for more. ## Teardown To teardown the Kubernetes cluster, run: diff --git a/examples/serve/minimal.yaml b/examples/serve/minimal.yaml new file mode 100644 index 00000000000..c925d26f5d1 --- /dev/null +++ b/examples/serve/minimal.yaml @@ -0,0 +1,11 @@ +# An minimal example of a serve application. + +service: + readiness_probe: / + replicas: 1 + +resources: + ports: 8080 + cpus: 2+ + +run: python3 -m http.server 8080 diff --git a/examples/spot/lightning_cifar10/train.py b/examples/spot/lightning_cifar10/train.py index 0df6f18484b..14901e635ef 100644 --- a/examples/spot/lightning_cifar10/train.py +++ b/examples/spot/lightning_cifar10/train.py @@ -163,7 +163,7 @@ def main(): ) model_ckpts = glob.glob(argv.root_dir + "/*.ckpt") - if argv.resume and len(model_ckpts) > 0: + if argv.resume and model_ckpts: latest_ckpt = max(model_ckpts, key=os.path.getctime) trainer.fit(model, cifar10_dm, ckpt_path=latest_ckpt) else: diff --git a/examples/stable_diffusion/README.md b/examples/stable_diffusion/README.md index 2a4383f1347..56af44df91e 100644 --- a/examples/stable_diffusion/README.md +++ b/examples/stable_diffusion/README.md @@ -1,6 +1,6 @@ ## Setup -1. Install skypilot package by following these [instructions](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). +1. Install skypilot package by following these [instructions](https://docs.skypilot.co/en/latest/getting-started/installation.html). 2. Run `git clone https://github.com/skypilot-org/skypilot.git && cd examples/stable_diffusion` diff --git a/examples/stable_diffusion/pushing_docker_image.md b/examples/stable_diffusion/pushing_docker_image.md index 80b285fa832..0585d566543 100644 --- a/examples/stable_diffusion/pushing_docker_image.md +++ b/examples/stable_diffusion/pushing_docker_image.md @@ -1,6 +1,6 @@ ## GCR -1. Install skypilot package by following these [instructions](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). +1. Install skypilot package by following these [instructions](https://docs.skypilot.co/en/latest/getting-started/installation.html). 2. Run `git clone https://github.com/skypilot-org/skypilot.git `. 
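Circling back to the `examples/distributed-pytorch` README added earlier in this diff: the `torchrun` flags shown there translate SkyPilot's `SKYPILOT_*` variables into the standard rendezvous settings, which the training script then consumes through the usual `env://` initialization. The snippet below is only a generic illustration of that handshake, not the minGPT `main.py` from pytorch/examples.

```python
# Generic illustration (not the actual minGPT-ddp main.py): torchrun turns its
# --nnodes/--nproc_per_node/--node_rank (or --rdzv_*) flags into RANK, LOCAL_RANK,
# WORLD_SIZE, MASTER_ADDR, and MASTER_PORT, which env:// initialization reads.
import os

import torch
import torch.distributed as dist


def main():
    dist.init_process_group(backend='nccl', init_method='env://')
    local_rank = int(os.environ['LOCAL_RANK'])
    torch.cuda.set_device(local_rank)
    print(f'rank {dist.get_rank()}/{dist.get_world_size()} using GPU {local_rank}')
    # ... build the model, wrap it in torch.nn.parallel.DistributedDataParallel, train ...
    dist.destroy_process_group()


if __name__ == '__main__':
    main()
```

With either launch style in that README, each node runs one `torchrun` process, and `torchrun` spawns one worker per GPU on the node.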
diff --git a/llm/codellama/README.md b/llm/codellama/README.md index f145fd062ff..54019bd6d2a 100644 --- a/llm/codellama/README.md +++ b/llm/codellama/README.md @@ -38,7 +38,7 @@ The followings are the demos of Code Llama 70B hosted by SkyPilot Serve (aka Sky ## Running your own Code Llama with SkyPilot -After [installing SkyPilot](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html), run your own Code Llama on vLLM with SkyPilot in 1-click: +After [installing SkyPilot](https://docs.skypilot.co/en/latest/getting-started/installation.html), run your own Code Llama on vLLM with SkyPilot in 1-click: 1. Start serving Code Llama 70B on a single instance with any available GPU in the list specified in [endpoint.yaml](https://github.com/skypilot-org/skypilot/tree/master/llm/codellama/endpoint.yaml) with a vLLM powered OpenAI-compatible endpoint: ```console @@ -100,7 +100,7 @@ This returns the following completion: ## Scale up the service with SkyServe -1. With [SkyServe](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html), a serving library built on top of SkyPilot, scaling up the Code Llama service is as simple as running: +1. With [SkyServe](https://docs.skypilot.co/en/latest/serving/sky-serve.html), a serving library built on top of SkyPilot, scaling up the Code Llama service is as simple as running: ```bash sky serve up -n code-llama ./endpoint.yaml ``` diff --git a/llm/dbrx/README.md b/llm/dbrx/README.md index 3011af9d4e6..2845634b287 100644 --- a/llm/dbrx/README.md +++ b/llm/dbrx/README.md @@ -11,7 +11,7 @@ In this recipe, you will serve `databricks/dbrx-instruct` on your own infra -- ## Prerequisites - Go to the [HuggingFace model page](https://huggingface.co/databricks/dbrx-instruct) and request access to the model `databricks/dbrx-instruct`. -- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)). +- Check that you have installed SkyPilot ([docs](https://docs.skypilot.co/en/latest/getting-started/installation.html)). - Check that `sky check` shows clouds or Kubernetes are enabled. ## SkyPilot YAML @@ -278,6 +278,6 @@ To shut down all resources: sky serve down dbrx ``` -See more details in [SkyServe docs](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html). +See more details in [SkyServe docs](https://docs.skypilot.co/en/latest/serving/sky-serve.html). diff --git a/llm/gemma/README.md b/llm/gemma/README.md index ef5027b2807..7296f7c7e31 100644 --- a/llm/gemma/README.md +++ b/llm/gemma/README.md @@ -24,7 +24,7 @@ Generate a read-only access token on huggingface [here](https://huggingface.co/s ```bash pip install "skypilot-nightly[all]" ``` -For detailed installation instructions, please refer to the [installation guide](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). +For detailed installation instructions, please refer to the [installation guide](https://docs.skypilot.co/en/latest/getting-started/installation.html). ### Host on a Single Instance diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index 10fa2cf6998..b8e656e2353 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -13,7 +13,7 @@ pip install "skypilot-nightly[aws,gcp,azure,kubernetes,lambda,fluidstack]" # Cho ```bash sky check ``` -Please check the instructions for enabling clouds at [SkyPilot doc](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). 
+Please check the instructions for enabling clouds at [SkyPilot doc](https://docs.skypilot.co/en/latest/getting-started/installation.html). 3. Download the YAML for starting the training: ```bash diff --git a/llm/llama-3/README.md b/llm/llama-3/README.md index 8ffcb3087a9..c4cf9066f63 100644 --- a/llm/llama-3/README.md +++ b/llm/llama-3/README.md @@ -29,7 +29,7 @@ ## Prerequisites - Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) and request access to the model `meta-llama/Meta-Llama-3-70B-Instruct`. -- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)). +- Check that you have installed SkyPilot ([docs](https://docs.skypilot.co/en/latest/getting-started/installation.html)). - Check that `sky check` shows clouds or Kubernetes are enabled. ## SkyPilot YAML @@ -326,7 +326,7 @@ To shut down all resources: sky serve down llama3 ``` -See more details in [SkyServe docs](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html). +See more details in [SkyServe docs](https://docs.skypilot.co/en/latest/serving/sky-serve.html). ### **Optional**: Connect a GUI to your Llama-3 endpoint @@ -349,4 +349,4 @@ sky launch -c llama3-gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint ## Finetuning Llama-3 -You can finetune Llama-3 on your own data. We have an tutorial for finetunning Llama-2 for Vicuna on SkyPilot, which can be adapted for Llama-3. You can find the tutorial [here](https://skypilot.readthedocs.io/en/latest/gallery/tutorials/finetuning.html) and a detailed blog post [here](https://blog.skypilot.co/finetuning-llama2-operational-guide/). +You can finetune Llama-3 on your own data. We have an tutorial for finetunning Llama-2 for Vicuna on SkyPilot, which can be adapted for Llama-3. You can find the tutorial [here](https://docs.skypilot.co/en/latest/gallery/tutorials/finetuning.html) and a detailed blog post [here](https://blog.skypilot.co/finetuning-llama2-operational-guide/). diff --git a/llm/llama-3_1-finetuning/readme.md b/llm/llama-3_1-finetuning/readme.md index 935dccde84e..ddc2b9e2463 100644 --- a/llm/llama-3_1-finetuning/readme.md +++ b/llm/llama-3_1-finetuning/readme.md @@ -7,10 +7,10 @@ On July 23, 2024, Meta released the [Llama 3.1 model family](https://ai.meta.com/blog/meta-llama-3-1/), including a 405B parameter model in both base model and instruction-tuned forms. Llama 3.1 405B became _the first open LLM that closely rivals top proprietary models_ like GPT-4o and Claude 3.5 Sonnet. -This guide shows how to use [SkyPilot](https://github.com/skypilot-org/skypilot) and [torchtune](https://pytorch.org/torchtune/stable/index.html) to **finetune Llama 3.1 on your own data and infra**. Everything is packaged in a simple [SkyPilot YAML](https://skypilot.readthedocs.io/en/latest/getting-started/quickstart.html), that can be launched with one command on your infra: +This guide shows how to use [SkyPilot](https://github.com/skypilot-org/skypilot) and [torchtune](https://pytorch.org/torchtune/stable/index.html) to **finetune Llama 3.1 on your own data and infra**. 
Everything is packaged in a simple [SkyPilot YAML](https://docs.skypilot.co/en/latest/getting-started/quickstart.html), that can be launched with one command on your infra: - Local GPU workstation - Kubernetes cluster -- Cloud accounts ([12 clouds supported](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)) +- Cloud accounts ([12 clouds supported](https://docs.skypilot.co/en/latest/getting-started/installation.html))
@@ -233,7 +233,7 @@ export HF_TOKEN="xxxx" ```bash pip install skypilot-nightly[aws,gcp,kubernetes] # or other clouds (12 clouds + kubernetes supported) you have setup -# See: https://skypilot.readthedocs.io/en/latest/getting-started/installation.html +# See: https://docs.skypilot.co/en/latest/getting-started/installation.html ``` 5. Check your infra setup: @@ -262,6 +262,6 @@ sky check ## What's next * [AI on Kubernetes Without the Pain](https://blog.skypilot.co/ai-on-kubernetes/) -* [SkyPilot AI Gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html) -* [SkyPilot Docs](https://skypilot.readthedocs.io/en/latest/docs/index.html) +* [SkyPilot AI Gallery](https://docs.skypilot.co/en/latest/gallery/index.html) +* [SkyPilot Docs](https://docs.skypilot.co) * [SkyPilot GitHub](https://github.com/skypilot-org/skypilot) diff --git a/llm/llama-3_1/README.md b/llm/llama-3_1/README.md index 6cfeb8dc5f9..2634811d8a1 100644 --- a/llm/llama-3_1/README.md +++ b/llm/llama-3_1/README.md @@ -13,7 +13,7 @@ This guide walks through how to serve Llama 3.1 models **completely on your infr - Local GPU workstation - Kubernetes cluster -- Cloud accounts ([12 clouds supported](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)) +- Cloud accounts ([12 clouds supported](https://docs.skypilot.co/en/latest/getting-started/installation.html)) SkyPilot will be used as the unified framework to launch serving on any (or multiple) infra that you bring. @@ -64,7 +64,7 @@ sky check kubernetes sky check ``` -See [docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html) for details. +See [docs](https://docs.skypilot.co/en/latest/getting-started/installation.html) for details. ### Step 1: Get a GPU dev node (pod or VM) @@ -155,7 +155,7 @@ Now that we verified the model is working, let's package it for hands-free deplo Whichever infra you use for GPUs, SkyPilot abstracts away the mundane infra tasks (e.g., setting up services on K8s, opening up ports for cloud VMs), making AI models super easy to deploy via one command. -[Deploying via SkyPilot](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html) has several key benefits: +[Deploying via SkyPilot](https://docs.skypilot.co/en/latest/serving/sky-serve.html) has several key benefits: - Control node & replicas completely stay in your infra - Automatic load-balancing across multiple replicas - Automatic recovery of replicas @@ -296,7 +296,7 @@ curl -L http://$ENDPOINT/v1/chat/completions \ 🎉 **Congratulations!** You are now serving a Llama 3.1 8B model across two replicas. To recap, all model replicas **stay in your own private infrastructure** and SkyPilot ensures they are **healthy and available**. -Details on autoscaling, rolling updates, and more in [SkyServe docs](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html). +Details on autoscaling, rolling updates, and more in [SkyServe docs](https://docs.skypilot.co/en/latest/serving/sky-serve.html). When you are done, shut down all resources: diff --git a/llm/llama-3_2/README.md b/llm/llama-3_2/README.md index 987dc0d90c5..f6c2a54ce6a 100644 --- a/llm/llama-3_2/README.md +++ b/llm/llama-3_2/README.md @@ -26,7 +26,7 @@ ## Prerequisites - Go to the [HuggingFace model page](https://huggingface.co/meta-llama/) and request access to the model [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) and [meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision). 
-- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)). +- Check that you have installed SkyPilot ([docs](https://docs.skypilot.co/en/latest/getting-started/installation.html)). - Check that `sky check` shows clouds or Kubernetes are enabled. ## SkyPilot YAML @@ -346,7 +346,7 @@ To shut down all resources: sky serve down llama3 ``` -See more details in [SkyServe docs](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html). +See more details in [SkyServe docs](https://docs.skypilot.co/en/latest/serving/sky-serve.html). ## Developing and Finetuning Llama 3 series diff --git a/llm/llama-chatbots/README.md b/llm/llama-chatbots/README.md index 418d3d39d15..272cc24d288 100644 --- a/llm/llama-chatbots/README.md +++ b/llm/llama-chatbots/README.md @@ -17,12 +17,12 @@ It will automatically perform the following: [**LLaMA**](https://github.com/facebookresearch/llama) is a set of Large Language Models (LLMs) recently released by Meta. Trained on more than 1 trillion tokens from public datasets, LLaMA achieves high quality and is space-efficient. You can [fill out a form to request access from Meta](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform) to download the open model weights. In the steps below we assume either (1) you have an unexpired download URL, or (2) the weights have been downloaded and stored on the local machine. -[**SkyPilot**](https://github.com/skypilot-org/skypilot) is an open-source framework from UC Berkeley for seamlessly running machine learning on any cloud. With a simple CLI, users can easily launch many clusters and jobs, while substantially lowering their cloud bills. Currently, [Lambda Labs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#lambda-cloud) (low-cost GPU cloud), [AWS](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#aws), [GCP](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#gcp), and [Azure](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#azure) are supported. See [docs](https://skypilot.readthedocs.io/en/latest/) to learn more. +[**SkyPilot**](https://github.com/skypilot-org/skypilot) is an open-source framework from UC Berkeley for seamlessly running machine learning on any cloud. With a simple CLI, users can easily launch many clusters and jobs, while substantially lowering their cloud bills. Currently, [Lambda Labs](https://docs.skypilot.co/en/latest/getting-started/installation.html#lambda-cloud) (low-cost GPU cloud), [AWS](https://docs.skypilot.co/en/latest/getting-started/installation.html#aws), [GCP](https://docs.skypilot.co/en/latest/getting-started/installation.html#gcp), and [Azure](https://docs.skypilot.co/en/latest/getting-started/installation.html#azure) are supported. See [docs](https://docs.skypilot.co/en/latest/) to learn more. ## Steps All YAML files used below live in [the SkyPilot repo](https://github.com/skypilot-org/skypilot/tree/master/llm/llama-chatbots), and the chatbot code is [here](https://github.com/skypilot-org/sky-llama). -0. Install SkyPilot and [check that cloud credentials exist](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#cloud-account-setup): +0. 
Install SkyPilot and [check that cloud credentials exist](https://docs.skypilot.co/en/latest/getting-started/installation.html#cloud-account-setup): ```bash pip install "skypilot[aws,gcp,azure,lambda]" # pick your clouds sky check @@ -120,7 +120,7 @@ sky launch llama-30b.yaml -c llama-30b -s --env LLAMA_URL=$LLAMA_URL sky launch llama-65b.yaml -c llama-65b -s --env LLAMA_URL=$LLAMA_URL ``` -To see details about these flags, see [CLI docs](https://skypilot.readthedocs.io/en/latest/reference/cli.html) or run `sky launch -h`. +To see details about these flags, see [CLI docs](https://docs.skypilot.co/en/latest/reference/cli.html) or run `sky launch -h`. ## Cleaning up When you are done, you can stop or tear down the cluster: @@ -140,7 +140,7 @@ When you are done, you can stop or tear down the cluster: ``` **To see your clusters**, run `sky status`, which is a single pane of glass for all your clusters across regions/clouds. -To learn more about various SkyPilot commands, see [Quickstart](https://skypilot.readthedocs.io/en/latest/getting-started/quickstart.html). +To learn more about various SkyPilot commands, see [Quickstart](https://docs.skypilot.co/en/latest/getting-started/quickstart.html). ## Why SkyPilot? @@ -166,12 +166,12 @@ SkyPilot's `sky launch` command makes this entirely automatic. It performs *auto - low-cost GPU cloud (Lambda; >3x cheaper than AWS/Azure/GCP) - spot instances (>3x cheaper than on-demand) - automatically choosing the cheapest cloud/region/zone -- auto-stopping & auto-termination of instances ([docs](https://skypilot.readthedocs.io/en/latest/reference/auto-stop.html)) +- auto-stopping & auto-termination of instances ([docs](https://docs.skypilot.co/en/latest/reference/auto-stop.html)) ## Recap Congratulations! You have used SkyPilot to launch a LLaMA-based chatbot on the cloud with just one command. The system automatically handles setting up instances and it offers cloud portability, higher GPU availability, and cost reduction. -LLaMA chatbots are just one example app. To leverage these benefits for your own ML projects on the cloud, we recommend the [Quickstart guide](https://skypilot.readthedocs.io/en/latest/getting-started/quickstart.html). +LLaMA chatbots are just one example app. To leverage these benefits for your own ML projects on the cloud, we recommend the [Quickstart guide](https://docs.skypilot.co/en/latest/getting-started/quickstart.html). *Feedback or questions? Want to run other LLM models?* Feel free to drop a note to the SkyPilot team on [GitHub](https://github.com/skypilot-org/skypilot/) or [Slack](http://slack.skypilot.co/) and we're happy to chat! diff --git a/llm/localgpt/README.md b/llm/localgpt/README.md index 17b3332ee30..c52f1b08851 100644 --- a/llm/localgpt/README.md +++ b/llm/localgpt/README.md @@ -13,7 +13,7 @@ Install SkyPilot and check your setup of cloud credentials: pip install git+https://github.com/skypilot-org/skypilot.git sky check ``` -See [docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html) for more. +See [docs](https://docs.skypilot.co/en/latest/getting-started/installation.html) for more. Once you are done, we will use [SkyPilot YAML for localGPT](https://github.com/skypilot-org/skypilot/tree/master/llm/localgpt/localgpt.yaml) to define our task and run it. 
diff --git a/llm/lorax/README.md b/llm/lorax/README.md index edd153d45f1..b1d5def6e78 100644 --- a/llm/lorax/README.md +++ b/llm/lorax/README.md @@ -40,7 +40,7 @@ sky launch -c lorax-cluster lorax.yaml By default, this config will deploy `Mistral-7B-Instruct`, but this can be overridden by running `sky launch` with the argument `--env MODEL_ID=`. -**NOTE:** This config will launch the instance on a public IP. It's highly recommended to secure the instance within a private subnet. See the [Advanced Configurations](https://skypilot.readthedocs.io/en/latest/reference/config.html#config-yaml) section of the SkyPilot docs for options to run within VPC and setup private IPs. +**NOTE:** This config will launch the instance on a public IP. It's highly recommended to secure the instance within a private subnet. See the [Advanced Configurations](https://docs.skypilot.co/en/latest/reference/config.html#config-yaml) section of the SkyPilot docs for options to run within VPC and setup private IPs. ## Prompt LoRAX w/ base model diff --git a/llm/mixtral/README.md b/llm/mixtral/README.md index 0bddb77c665..8456dbb5fcf 100644 --- a/llm/mixtral/README.md +++ b/llm/mixtral/README.md @@ -15,7 +15,7 @@ SkyPilot can help you serve Mixtral by automatically finding available resources sky launch -c mixtral ./serve.yaml ``` -Note that we specify the following resources, so that SkyPilot will automatically find any of the available GPUs specified by automatically [failover](https://skypilot.readthedocs.io/en/latest/examples/auto-failover.html) through all the candidates (in the order of the prices): +Note that we specify the following resources, so that SkyPilot will automatically find any of the available GPUs specified by automatically [failover](https://docs.skypilot.co/en/latest/examples/auto-failover.html) through all the candidates (in the order of the prices): ```yaml resources: @@ -82,7 +82,7 @@ curl http://$IP:8000/v1/chat/completions \ ## 2. Serve with multiple instances -When scaling up is required, [SkyServe](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html) is the library built on top of SkyPilot, which can help you scale up the serving with multiple instances, while still providing a single endpoint. To serve Mixtral with multiple instances, run the following command: +When scaling up is required, [SkyServe](https://docs.skypilot.co/en/latest/serving/sky-serve.html) is the library built on top of SkyPilot, which can help you scale up the serving with multiple instances, while still providing a single endpoint. To serve Mixtral with multiple instances, run the following command: ```bash sky serve up -n mixtral ./serve.yaml diff --git a/llm/ollama/README.md b/llm/ollama/README.md index 16a8a9ea8e4..2d15b598381 100644 --- a/llm/ollama/README.md +++ b/llm/ollama/README.md @@ -17,7 +17,7 @@ To get started, install the latest version of SkyPilot: pip install "skypilot-nightly[all]" ``` -For detailed installation instructions, please refer to the [installation guide](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). +For detailed installation instructions, please refer to the [installation guide](https://docs.skypilot.co/en/latest/getting-started/installation.html). Once installed, run `sky check` to verify you have cloud access. @@ -296,4 +296,4 @@ To shut down all resources: sky serve down ollama ``` -See more details in [SkyServe docs](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html). 
+See more details in [SkyServe docs](https://docs.skypilot.co/en/latest/serving/sky-serve.html). diff --git a/llm/pixtral/README.md b/llm/pixtral/README.md index fccde1de7ad..987769c892a 100644 --- a/llm/pixtral/README.md +++ b/llm/pixtral/README.md @@ -57,7 +57,7 @@ This guide shows how to use run and deploy this multimodal model on your own clo pip install 'skypilot[all]' sky check ``` -Detailed instructions for installation and cloud setup [here](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). +Detailed instructions for installation and cloud setup [here](https://docs.skypilot.co/en/latest/getting-started/installation.html). 2. Launch the model on any cloud or Kubernetes: ```bash @@ -150,7 +150,7 @@ These descriptions should give you a clear picture of the scenes depicted in the ## Scale Up Pixtral Endpoint as a Service -1. Start a service with [SkyServe](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html): +1. Start a service with [SkyServe](https://docs.skypilot.co/en/latest/serving/sky-serve.html): ```bash sky serve up -n pixtral pixtral.yaml ``` diff --git a/llm/qwen/README.md b/llm/qwen/README.md index 6846fc71f2f..d4c73edb842 100644 --- a/llm/qwen/README.md +++ b/llm/qwen/README.md @@ -27,7 +27,7 @@ As of Jun 2024, Qwen1.5-110B-Chat is ranked higher than GPT-4-0613 on the [LMSYS ## Running your own Qwen with SkyPilot -After [installing SkyPilot](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html), run your own Qwen model on vLLM with SkyPilot in 1-click: +After [installing SkyPilot](https://docs.skypilot.co/en/latest/getting-started/installation.html), run your own Qwen model on vLLM with SkyPilot in 1-click: 1. Start serving Qwen 110B on a single instance with any available GPU in the list specified in [qwen15-110b.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/qwen/qwen15-110b.yaml) with a vLLM powered OpenAI-compatible endpoint (You can also switch to [qwen25-72b.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/qwen/qwen25-72b.yaml) or [qwen25-7b.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/qwen/qwen25-7b.yaml) for a smaller model): @@ -98,7 +98,7 @@ curl http://$ENDPOINT/v1/chat/completions \ ## Scale up the service with SkyServe -1. With [SkyPilot Serving](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html), a serving library built on top of SkyPilot, scaling up the Qwen service is as simple as running: +1. With [SkyPilot Serving](https://docs.skypilot.co/en/latest/serving/sky-serve.html), a serving library built on top of SkyPilot, scaling up the Qwen service is as simple as running: ```bash sky serve up -n qwen ./qwen25-72b.yaml ``` diff --git a/llm/sglang/README.md b/llm/sglang/README.md index 7d41b8fc168..f6bac3c71ad 100644 --- a/llm/sglang/README.md +++ b/llm/sglang/README.md @@ -21,7 +21,7 @@ sky check ``` ## Serving vision-language model LLaVA with SGLang for more traffic using SkyServe -1. Create a [`SkyServe Service YAML`](https://skypilot.readthedocs.io/en/latest/serving/service-yaml-spec.html) with a `service` section: +1. Create a [`SkyServe Service YAML`](https://docs.skypilot.co/en/latest/serving/service-yaml-spec.html) with a `service` section: ```yaml service: @@ -33,7 +33,7 @@ service: The entire Service YAML can be found here: [llava.yaml](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang/llava.yaml). -2. Start serving by using [SkyServe](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html) CLI: +2. 
Start serving by using [SkyServe](https://docs.skypilot.co/en/latest/serving/sky-serve.html) CLI: ```bash sky serve up -n sglang-llava llava.yaml ``` @@ -117,7 +117,7 @@ You should get a similar response as the following: ## Serving Llama-2 with SGLang for more traffic using SkyServe 1. The process is the same as serving LLaVA, but with the model path changed to Llama-2. Below are example commands for reference. -2. Start serving by using [SkyServe](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html) CLI: +2. Start serving by using [SkyServe](https://docs.skypilot.co/en/latest/serving/sky-serve.html) CLI: ```bash sky serve up -n sglang-llama2 llama2.yaml --env HF_TOKEN= ``` diff --git a/llm/tabby/README.md b/llm/tabby/README.md index 569b64538c1..9aa4ca4c803 100644 --- a/llm/tabby/README.md +++ b/llm/tabby/README.md @@ -17,13 +17,13 @@ This post shows how to use SkyPilot to host an ai coding assistant with just one - OpenAPI interface, easy to integrate with existing infrastructure (e.g Cloud IDE). - Supports consumer-grade GPUs. -[**SkyPilot**](https://github.com/skypilot-org/skypilot) is an open-source framework from UC Berkeley for seamlessly running machine learning on any cloud. With a simple CLI, users can easily launch many clusters and jobs, while substantially lowering their cloud bills. Currently, [AWS](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#aws), [GCP](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#gcp), [Azure](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#azure), [Lambda Cloud](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#lambda-cloud), [IBM](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#ibm), [Oracle Cloud Infrastructure (OCI)](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#oracle-cloud-infrastructure-oci), [Cloudflare R2](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#cloudflare-r2) and [Samsung Cloud Platform (SCP)](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#samsung-cloud-platform-scp) are supported. See [docs](https://skypilot.readthedocs.io/en/latest/) to learn more. +[**SkyPilot**](https://github.com/skypilot-org/skypilot) is an open-source framework from UC Berkeley for seamlessly running machine learning on any cloud. With a simple CLI, users can easily launch many clusters and jobs, while substantially lowering their cloud bills. Currently, [AWS](https://docs.skypilot.co/en/latest/getting-started/installation.html#aws), [GCP](https://docs.skypilot.co/en/latest/getting-started/installation.html#gcp), [Azure](https://docs.skypilot.co/en/latest/getting-started/installation.html#azure), [Lambda Cloud](https://docs.skypilot.co/en/latest/getting-started/installation.html#lambda-cloud), [IBM](https://docs.skypilot.co/en/latest/getting-started/installation.html#ibm), [Oracle Cloud Infrastructure (OCI)](https://docs.skypilot.co/en/latest/getting-started/installation.html#oracle-cloud-infrastructure-oci), [Cloudflare R2](https://docs.skypilot.co/en/latest/getting-started/installation.html#cloudflare-r2) and [Samsung Cloud Platform (SCP)](https://docs.skypilot.co/en/latest/getting-started/installation.html#samsung-cloud-platform-scp) are supported. See [docs](https://docs.skypilot.co/en/latest/) to learn more. 
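The README hunks above all drive SkyPilot through its CLI (`sky launch`, `sky serve up`, `sky status`). For readers who script these steps instead, the same flow is available from Python; the sketch below is illustrative only and assumes the public `sky.Task`/`sky.Resources`/`sky.launch`/`sky.down` API, with the accelerator, cluster name, and commands as placeholders rather than values taken from these examples.

```python
import sky

# Illustrative sketch only: the names, accelerator, and commands are placeholders.
task = sky.Task(
    name='demo',
    setup='echo "setting up"',
    run='echo "hello from SkyPilot"',
)
task.set_resources(sky.Resources(accelerators='L4:1'))  # pick any available GPU

sky.launch(task, cluster_name='demo-cluster')  # ~ `sky launch -c demo-cluster demo.yaml`
print(sky.status())                            # ~ `sky status`
sky.down('demo-cluster')                       # ~ `sky down demo-cluster`
```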
## Steps All YAML files used below live in [the SkyPilot repo](https://github.com/skypilot-org/skypilot/tree/master/llm/tabby). -1. Install SkyPilot and [check that cloud credentials exist](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#cloud-account-setup): +1. Install SkyPilot and [check that cloud credentials exist](https://docs.skypilot.co/en/latest/getting-started/installation.html#cloud-account-setup): ```bash # pip install skypilot @@ -94,4 +94,4 @@ When you are done, you can stop or tear down the cluster: ``` **To see your clusters**, run `sky status`, which is a single pane of glass for all your clusters across regions/clouds. -To learn more about various SkyPilot commands, see [Quickstart](https://skypilot.readthedocs.io/en/latest/getting-started/quickstart.html). +To learn more about various SkyPilot commands, see [Quickstart](https://docs.skypilot.co/en/latest/getting-started/quickstart.html). diff --git a/llm/vicuna-llama-2/README.md b/llm/vicuna-llama-2/README.md index e392b231e64..31d78a243cb 100644 --- a/llm/vicuna-llama-2/README.md +++ b/llm/vicuna-llama-2/README.md @@ -120,7 +120,7 @@ sky launch --no-use-spot ... ### Reducing costs by 3x with spot instances -[SkyPilot Managed Jobs](https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html) is a library built on top of SkyPilot that helps users run jobs on spot instances without worrying about interruptions. That is the tool used by the LMSYS organization to train the first version of Vicuna (more details can be found in their [launch blog post](https://lmsys.org/blog/2023-03-30-vicuna/) and [example](https://github.com/skypilot-org/skypilot/tree/master/llm/vicuna)). With this, the training cost can be reduced from $1000 to **\$300**. +[SkyPilot Managed Jobs](https://docs.skypilot.co/en/latest/examples/managed-jobs.html) is a library built on top of SkyPilot that helps users run jobs on spot instances without worrying about interruptions. That is the tool used by the LMSYS organization to train the first version of Vicuna (more details can be found in their [launch blog post](https://lmsys.org/blog/2023-03-30-vicuna/) and [example](https://github.com/skypilot-org/skypilot/tree/master/llm/vicuna)). With this, the training cost can be reduced from $1000 to **\$300**. To use SkyPilot Managed Spot Jobs, you can simply replace `sky launch` with `sky jobs launch` in the above command: diff --git a/llm/vicuna/README.md b/llm/vicuna/README.md index 6d9f46127d4..b8c6ab100d8 100644 --- a/llm/vicuna/README.md +++ b/llm/vicuna/README.md @@ -4,7 +4,7 @@ Vicuna LLM

-This README contains instructions to run and train Vicuna, an open-source LLM chatbot with quality comparable to ChatGPT. The Vicuna release was trained using SkyPilot on [cloud spot instances](https://skypilot.readthedocs.io/en/latest/examples/spot-jobs.html), with a cost of ~$300. +This README contains instructions to run and train Vicuna, an open-source LLM chatbot with quality comparable to ChatGPT. The Vicuna release was trained using SkyPilot on [cloud spot instances](https://docs.skypilot.co/en/latest/examples/spot-jobs.html), with a cost of ~$300. * [Blog post](https://lmsys.org/blog/2023-03-30-vicuna/) * [Demo](https://chat.lmsys.org/) diff --git a/llm/vllm/README.md b/llm/vllm/README.md index 78617f3746d..c150ae46e2d 100644 --- a/llm/vllm/README.md +++ b/llm/vllm/README.md @@ -112,7 +112,7 @@ curl http://$IP:8000/v1/chat/completions \ ## Serving Llama-2 with vLLM for more traffic using SkyServe To scale up the model serving for more traffic, we introduced SkyServe to enable a user to easily deploy multiple replica of the model: -1. Adding an `service` section in the above `serve-openai-api.yaml` file to make it an [`SkyServe Service YAML`](https://skypilot.readthedocs.io/en/latest/serving/service-yaml-spec.html): +1. Adding an `service` section in the above `serve-openai-api.yaml` file to make it an [`SkyServe Service YAML`](https://docs.skypilot.co/en/latest/serving/service-yaml-spec.html): ```yaml # The newly-added `service` section to the `serve-openai-api.yaml` file. @@ -125,7 +125,7 @@ service: The entire Service YAML can be found here: [service.yaml](https://github.com/skypilot-org/skypilot/tree/master/llm/vllm/service.yaml). -2. Start serving by using [SkyServe](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html) CLI: +2. Start serving by using [SkyServe](https://docs.skypilot.co/en/latest/serving/sky-serve.html) CLI: ```bash sky serve up -n vllm-llama2 service.yaml ``` diff --git a/llm/yi/README.md b/llm/yi/README.md index 1353320aa9f..b9d5c4a761d 100644 --- a/llm/yi/README.md +++ b/llm/yi/README.md @@ -19,7 +19,7 @@ ## Running Yi model with SkyPilot -After [installing SkyPilot](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html), run your own Yi model on vLLM with SkyPilot in 1-click: +After [installing SkyPilot](https://docs.skypilot.co/en/latest/getting-started/installation.html), run your own Yi model on vLLM with SkyPilot in 1-click: 1. 
Start serving Yi-1.5 34B on a single instance with any available GPU in the list specified in [yi15-34b.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/yi/yi15-34b.yaml) with a vLLM powered OpenAI-compatible endpoint (You can also switch to [yicoder-9b.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/yi/yicoder-9b.yaml) or [other model](https://github.com/skypilot-org/skypilot/tree/master/llm/yi) for a smaller model): diff --git a/sky/adaptors/cloudflare.py b/sky/adaptors/cloudflare.py index 864248614f3..e9c5613c97e 100644 --- a/sky/adaptors/cloudflare.py +++ b/sky/adaptors/cloudflare.py @@ -177,7 +177,7 @@ def check_credentials() -> Tuple[bool, Optional[str]]: hints += f'\n{_INDENT_PREFIX} $ mkdir -p ~/.cloudflare' hints += f'\n{_INDENT_PREFIX} $ echo > ~/.cloudflare/accountid' # pylint: disable=line-too-long hints += f'\n{_INDENT_PREFIX}For more info: ' - hints += 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#cloudflare-r2' # pylint: disable=line-too-long + hints += 'https://docs.skypilot.co/en/latest/getting-started/installation.html#cloudflare-r2' # pylint: disable=line-too-long return (False, hints) if hints else (True, hints) diff --git a/sky/adaptors/oci.py b/sky/adaptors/oci.py index 7a5fafa854a..8fe09479a38 100644 --- a/sky/adaptors/oci.py +++ b/sky/adaptors/oci.py @@ -1,9 +1,16 @@ """Oracle OCI cloud adaptor""" +import logging import os from sky.adaptors import common +# Suppress OCI circuit breaker logging before lazy import, because +# oci modules prints additional message during imports, i.e., the +# set_logger in the LazyImport called after imports will not take +# effect. +logging.getLogger('oci.circuit_breaker').setLevel(logging.WARNING) + CONFIG_PATH = '~/.oci/config' ENV_VAR_OCI_CONFIG = 'OCI_CONFIG' diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index a3651bdba9a..0333cf49602 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -173,6 +173,16 @@ ('available_node_types', 'ray.head.default', 'node_config', 'azure_arm_parameters', 'cloudInitSetupCommands'), ] +# These keys are expected to change when provisioning on an existing cluster, +# but they don't actually represent a change that requires re-provisioning the +# cluster. If the cluster yaml is the same except for these keys, we can safely +# skip reprovisioning. See _deterministic_cluster_yaml_hash. +_RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [ + # On first launch, availability_zones will include all possible zones. Once + # the cluster exists, it will only include the zone that the cluster is + # actually in. + ('provider', 'availability_zone'), +] def is_ip(s: str) -> bool: @@ -1009,10 +1019,6 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str): common_utils.dump_yaml(cluster_config_file, config) -def get_run_timestamp() -> str: - return 'sky-' + datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f') - - def get_timestamp_from_run_timestamp(run_timestamp: str) -> float: return datetime.strptime( run_timestamp.partition('-')[2], '%Y-%m-%d-%H-%M-%S-%f').timestamp() @@ -1087,7 +1093,7 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str: yaml file and all the files in the file mounts, then hash the byte sequence. 
The format of the byte sequence is: - 32 bytes - sha256 hash of the yaml file + 32 bytes - sha256 hash of the yaml for each file mount: file mount remote destination (UTF-8), \0 if the file mount source is a file: @@ -1111,14 +1117,29 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str: we construct it incrementally by using hash.update() to add new bytes. """ + # Load the yaml contents so that we can directly remove keys. + yaml_config = common_utils.read_yaml(yaml_path) + for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH: + dict_to_remove_from = yaml_config + found_key = True + for key in key_list[:-1]: + if (not isinstance(dict_to_remove_from, dict) or + key not in dict_to_remove_from): + found_key = False + break + dict_to_remove_from = dict_to_remove_from[key] + if found_key and key_list[-1] in dict_to_remove_from: + dict_to_remove_from.pop(key_list[-1]) + def _hash_file(path: str) -> bytes: return common_utils.hash_file(path, 'sha256').digest() config_hash = hashlib.sha256() - config_hash.update(_hash_file(yaml_path)) + yaml_hash = hashlib.sha256( + common_utils.dump_yaml_str(yaml_config).encode('utf-8')) + config_hash.update(yaml_hash.digest()) - yaml_config = common_utils.read_yaml(yaml_path) file_mounts = yaml_config.get('file_mounts', {}) # Remove the file mounts added by the newline. if '' in file_mounts: @@ -1126,6 +1147,11 @@ def _hash_file(path: str) -> bytes: file_mounts.pop('') for dst, src in sorted(file_mounts.items()): + if src == yaml_path: + # Skip the yaml file itself. We have already hashed a modified + # version of it. The file may include fields we don't want to hash. + continue + expanded_src = os.path.expanduser(src) config_hash.update(dst.encode('utf-8') + b'\0') diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 675e3e0ac8c..c6738778fac 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1093,7 +1093,7 @@ def _gcp_handler(blocked_resources: Set['resources_lib.Resources'], 'having the required permissions and the user ' 'account does not have enough permission to ' 'update it. Please contact your administrator and ' - 'check out: https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/gcp.html\n' # pylint: disable=line-too-long + 'check out: https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html\n' # pylint: disable=line-too-long f'Details: {message}') _add_to_blocked_resources( blocked_resources, @@ -1390,8 +1390,7 @@ def _retry_zones( f'in {to_provision.cloud}. ' f'{colorama.Style.RESET_ALL}' f'To request quotas, check the instruction: ' - f'https://skypilot.readthedocs.io/en/latest/cloud-setup/quota.html.' # pylint: disable=line-too-long - ) + f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.') for zones in self._yield_zones(to_provision, num_nodes, cluster_name, prev_cluster_status, @@ -2601,7 +2600,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']): ResourceHandle = CloudVmRayResourceHandle # pylint: disable=invalid-name def __init__(self): - self.run_timestamp = backend_utils.get_run_timestamp() + self.run_timestamp = sky_logging.get_run_timestamp() # NOTE: do not expanduser() here, as this '~/...' path is used for # remote as well to be expanded on the remote side. 
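The `backend_utils.py` hunks above change how the cluster YAML is hashed for the re-provisioning check: volatile keys such as `('provider', 'availability_zone')` are dropped from the parsed config, the digest is computed over the normalized YAML dump rather than the raw file bytes, and the YAML file itself is skipped when hashing file mounts. A condensed sketch of that idea, using PyYAML in place of the repo's `common_utils` helpers and with illustrative names rather than the actual ones:

```python
import hashlib

import yaml  # assumption: PyYAML stands in for common_utils.read_yaml / dump_yaml_str

# Mirrors the intent of _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH: keys that legitimately
# differ between launches and must not trigger re-provisioning.
_VOLATILE_KEYS = [('provider', 'availability_zone')]


def cluster_yaml_digest(yaml_path: str) -> str:
    """Hash the parsed YAML with volatile keys removed, not the raw file."""
    with open(yaml_path, encoding='utf-8') as f:
        config = yaml.safe_load(f)
    for *parents, leaf in _VOLATILE_KEYS:
        node = config
        for key in parents:
            if not isinstance(node, dict) or key not in node:
                node = None
                break
            node = node[key]
        if isinstance(node, dict):
            node.pop(leaf, None)
    # Hashing the normalized dump keeps the digest stable when only the ignored
    # keys (or on-disk formatting) change.
    return hashlib.sha256(yaml.safe_dump(config).encode('utf-8')).hexdigest()
```

The real implementation additionally folds the file-mount contents into the same SHA-256, as described in the docstring above.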
self.log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, @@ -2628,7 +2627,7 @@ def register_info(self, **kwargs) -> None: self._optimize_target) or optimizer.OptimizeTarget.COST self._requested_features = kwargs.pop('requested_features', self._requested_features) - assert len(kwargs) == 0, f'Unexpected kwargs: {kwargs}' + assert not kwargs, f'Unexpected kwargs: {kwargs}' def check_resources_fit_cluster( self, diff --git a/sky/benchmark/benchmark_utils.py b/sky/benchmark/benchmark_utils.py index c9c17f00944..766b1fa9138 100644 --- a/sky/benchmark/benchmark_utils.py +++ b/sky/benchmark/benchmark_utils.py @@ -535,7 +535,7 @@ def launch_benchmark_clusters(benchmark: str, clusters: List[str], for yaml_fd, cluster in zip(yaml_fds, clusters)] # Save stdout/stderr from cluster launches. - run_timestamp = backend_utils.get_run_timestamp() + run_timestamp = sky_logging.get_run_timestamp() log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp) log_dir = os.path.expanduser(log_dir) logger.info( diff --git a/sky/check.py b/sky/check.py index dcaa349d234..1ab92cb1af6 100644 --- a/sky/check.py +++ b/sky/check.py @@ -127,7 +127,7 @@ def get_all_clouds(): '\nNote: The following clouds were disabled because they were not ' 'included in allowed_clouds in ~/.sky/config.yaml: ' f'{", ".join([c for c in disallowed_cloud_names])}') - if len(all_enabled_clouds) == 0: + if not all_enabled_clouds: echo( click.style( 'No cloud is enabled. SkyPilot will not be able to run any ' @@ -146,7 +146,7 @@ def get_all_clouds(): dim=True) + click.style(f'sky check{clouds_arg}', bold=True) + '\n' + click.style( 'If any problems remain, refer to detailed docs at: ' - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html', # pylint: disable=line-too-long + 'https://docs.skypilot.co/en/latest/getting-started/installation.html', # pylint: disable=line-too-long dim=True)) if disallowed_clouds_hint: diff --git a/sky/cli.py b/sky/cli.py index 1faf0003ff9..5d4f07d535f 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -114,7 +114,7 @@ def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]: glob_clusters = [] for cluster in clusters: glob_cluster = global_user_state.get_glob_cluster_names(cluster) - if len(glob_cluster) == 0 and not silent: + if not glob_cluster and not silent: click.echo(f'Cluster {cluster} not found.') glob_clusters.extend(glob_cluster) return list(set(glob_clusters)) @@ -125,7 +125,7 @@ def _get_glob_storages(storages: List[str]) -> List[str]: glob_storages = [] for storage_object in storages: glob_storage = global_user_state.get_glob_storage_name(storage_object) - if len(glob_storage) == 0: + if not glob_storage: click.echo(f'Storage {storage_object} not found.') glob_storages.extend(glob_storage) return list(set(glob_storages)) @@ -830,7 +830,7 @@ class _NaturalOrderGroup(click.Group): Reference: https://github.com/pallets/click/issues/513 """ - def list_commands(self, ctx): + def list_commands(self, ctx): # pylint: disable=unused-argument return self.commands.keys() @usage_lib.entrypoint('sky.cli', fallback=True) @@ -1473,7 +1473,7 @@ def _get_services(service_names: Optional[List[str]], if len(service_records) != 1: plural = 's' if len(service_records) > 1 else '' service_num = (str(len(service_records)) - if len(service_records) > 0 else 'No') + if service_records else 'No') raise click.UsageError( f'{service_num} service{plural} found. Please specify ' 'an existing service to show its endpoint. 
Usage: ' @@ -1696,8 +1696,7 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool, if len(clusters) != 1: with ux_utils.print_exception_no_traceback(): plural = 's' if len(clusters) > 1 else '' - cluster_num = (str(len(clusters)) - if len(clusters) > 0 else 'No') + cluster_num = (str(len(clusters)) if clusters else 'No') cause = 'a single' if len(clusters) > 1 else 'an existing' raise ValueError( _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format( @@ -1722,9 +1721,8 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool, with ux_utils.print_exception_no_traceback(): plural = 's' if len(cluster_records) > 1 else '' cluster_num = (str(len(cluster_records)) - if len(cluster_records) > 0 else - f'{clusters[0]!r}') - verb = 'found' if len(cluster_records) > 0 else 'not found' + if cluster_records else f'{clusters[0]!r}') + verb = 'found' if cluster_records else 'not found' cause = 'a single' if len(clusters) > 1 else 'an existing' raise ValueError( _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format( @@ -2470,7 +2468,7 @@ def start( '(see `sky status`), or the -a/--all flag.') if all: - if len(clusters) > 0: + if clusters: click.echo('Both --all and cluster(s) specified for sky start. ' 'Letting --all take effect.') @@ -2800,7 +2798,7 @@ def _down_or_stop_clusters( option_str = '{stop,down}' operation = f'{verb} auto{option_str} on' - if len(names) > 0: + if names: controllers = [ name for name in names if controller_utils.Controllers.from_name(name) is not None @@ -2814,7 +2812,7 @@ def _down_or_stop_clusters( # Make sure the controllers are explicitly specified without other # normal clusters. if controllers: - if len(names) != 0: + if names: names_str = ', '.join(map(repr, names)) raise click.UsageError( f'{operation} controller(s) ' @@ -2867,7 +2865,7 @@ def _down_or_stop_clusters( if apply_to_all: all_clusters = global_user_state.get_clusters() - if len(names) > 0: + if names: click.echo( f'Both --all and cluster(s) specified for `sky {command}`. ' 'Letting --all take effect.') @@ -2894,7 +2892,7 @@ def _down_or_stop_clusters( click.echo('Cluster(s) not found (tip: see `sky status`).') return - if not no_confirm and len(clusters) > 0: + if not no_confirm and clusters: cluster_str = 'clusters' if len(clusters) > 1 else 'cluster' cluster_list = ', '.join(clusters) click.confirm( @@ -3003,7 +3001,7 @@ def check(clouds: Tuple[str], verbose: bool): # Check only specific clouds - AWS and GCP. sky check aws gcp """ - clouds_arg = clouds if len(clouds) > 0 else None + clouds_arg = clouds if clouds else None sky_check.check(verbose=verbose, clouds=clouds_arg) @@ -3138,7 +3136,7 @@ def _get_kubernetes_realtime_gpu_table( f'capacity ({list(capacity.keys())}), ' f'and available ({list(available.keys())}) ' 'must be same.') - if len(counts) == 0: + if not counts: err_msg = 'No GPUs found in Kubernetes cluster. ' debug_msg = 'To further debug, run: sky check ' if name_filter is not None: @@ -3282,7 +3280,7 @@ def _output(): for tpu in service_catalog.get_tpus(): if tpu in result: tpu_table.add_row([tpu, _list_to_str(result.pop(tpu))]) - if len(tpu_table.get_string()) > 0: + if tpu_table.get_string(): yield '\n\n' yield from tpu_table.get_string() @@ -3393,7 +3391,7 @@ def _output(): yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'Cloud GPUs{colorama.Style.RESET_ALL}\n') - if len(result) == 0: + if not result: quantity_str = (f' with requested quantity {quantity}' if quantity else '') cloud_str = f' on {cloud_obj}.' if cloud_name else ' in cloud catalogs.' 
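Many of the `cli.py` hunks above are a mechanical cleanup: explicit `len(x) == 0` / `len(x) > 0` checks become plain truthiness tests. A tiny, self-contained illustration of the idiom, including why the pandas-backed catalog code later in this diff switches to `df.empty` instead:

```python
import pandas as pd

clusters: list = []
if not clusters:  # preferred over `len(clusters) == 0`: empty containers are falsy
    print('Cluster(s) not found (tip: see `sky status`).')

df = pd.DataFrame()
# DataFrames intentionally refuse truthiness checks (`if not df:` raises
# ValueError: "The truth value of a DataFrame is ambiguous"), so the catalog
# code uses `df.empty` rather than `not df` when replacing `len(df) == 0`.
if df.empty:
    print('No matching instance types found.')
```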
@@ -3522,7 +3520,7 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r # Delete all storage objects. sky storage delete -a """ - if sum([len(names) > 0, all]) != 1: + if sum([bool(names), all]) != 1: raise click.UsageError('Either --all or a name must be specified.') if all: storages = sky.storage_ls() @@ -3601,15 +3599,12 @@ def jobs(): default=False, required=False, help='Skip confirmation prompt.') -# TODO(cooperc): remove this flag once --fast can robustly detect cluster -# yaml config changes +# TODO(cooperc): remove this flag before releasing 0.8.0 @click.option('--fast', default=False, is_flag=True, - help='[Experimental] Launch the job faster by skipping ' - 'controller initialization steps. If you update SkyPilot or ' - 'your local cloud credentials, they will not be reflected until ' - 'you run `sky jobs launch` at least once without this flag.') + help=('[Deprecated] Does nothing. Previous flag behavior is now ' + 'enabled by default.')) @timeline.event @usage_lib.entrypoint def jobs_launch( @@ -3634,7 +3629,7 @@ def jobs_launch( disk_tier: Optional[str], ports: Tuple[str], detach_run: bool, - retry_until_up: bool, + retry_until_up: Optional[bool], yes: bool, fast: bool, ): @@ -3692,6 +3687,16 @@ def jobs_launch( else: retry_until_up = True + # Deprecation. The default behavior is fast, and the flag will be removed. + # The flag was not present in 0.7.x (only nightly), so we will remove before + # 0.8.0 so that it never enters a stable release. + if fast: + click.secho( + 'Flag --fast is deprecated, as the behavior is now default. The ' + 'flag will be removed soon. Please do not use it, so that you ' + 'avoid "No such option" errors.', + fg='yellow') + if not isinstance(task_or_dag, sky.Dag): assert isinstance(task_or_dag, sky.Task), task_or_dag with sky.Dag() as dag: @@ -3733,8 +3738,7 @@ def jobs_launch( managed_jobs.launch(dag, name, detach_run=detach_run, - retry_until_up=retry_until_up, - fast=fast) + retry_until_up=retry_until_up) @jobs.command('queue', cls=_DocumentedCodeCommand) @@ -3875,8 +3879,8 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool): exit_if_not_accessible=True) job_id_str = ','.join(map(str, job_ids)) - if sum([len(job_ids) > 0, name is not None, all]) != 1: - argument_str = f'--job-ids {job_id_str}' if len(job_ids) > 0 else '' + if sum([bool(job_ids), name is not None, all]) != 1: + argument_str = f'--job-ids {job_id_str}' if job_ids else '' argument_str += f' --name {name}' if name is not None else '' argument_str += ' --all' if all else '' raise click.UsageError( @@ -4517,9 +4521,9 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool, # Forcefully tear down a specific replica, even in failed status. sky serve down my-service --replica-id 1 --purge """ - if sum([len(service_names) > 0, all]) != 1: - argument_str = f'SERVICE_NAMES={",".join(service_names)}' if len( - service_names) > 0 else '' + if sum([bool(service_names), all]) != 1: + argument_str = (f'SERVICE_NAMES={",".join(service_names)}' + if service_names else '') argument_str += ' --all' if all else '' raise click.UsageError( 'Can only specify one of SERVICE_NAMES or --all. ' @@ -4892,7 +4896,7 @@ def benchmark_launch( if idle_minutes_to_autostop is None: idle_minutes_to_autostop = 5 commandline_args['idle-minutes-to-autostop'] = idle_minutes_to_autostop - if len(env) > 0: + if env: commandline_args['env'] = [f'{k}={v}' for k, v in env] # Launch the benchmarking clusters in detach mode in parallel. 
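`storage_delete`, `jobs_cancel`, and `serve_down` above all validate mutually exclusive arguments with the same `sum([bool(names), all]) != 1` pattern, which reads as: exactly one of the name list or `--all` must be supplied. A minimal, hypothetical `click` command showing the pattern in isolation (the command name, option, and messages are illustrative, not taken from `sky/cli.py`):

```python
import click


@click.command()
@click.argument('names', nargs=-1)
@click.option('--all', 'delete_all', is_flag=True, default=False,
              help='Apply to every object instead of the named ones.')
def delete(names, delete_all):
    # bool(names) is True only when at least one NAME was given, so the sum
    # equals 1 exactly when one (and only one) of the two forms was used.
    if sum([bool(names), delete_all]) != 1:
        raise click.UsageError('Either --all or a name must be specified.')
    targets = list(names) if names else ['<all objects>']
    click.echo(f'Deleting: {", ".join(targets)}')


if __name__ == '__main__':
    delete()
```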
@@ -5171,7 +5175,7 @@ def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool], raise click.BadParameter( 'Either specify benchmarks or use --all to delete all benchmarks.') to_delete = [] - if len(benchmarks) > 0: + if benchmarks: for benchmark in benchmarks: record = benchmark_state.get_benchmark_from_name(benchmark) if record is None: @@ -5180,7 +5184,7 @@ def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool], to_delete.append(record) if all: to_delete = benchmark_state.get_benchmarks() - if len(benchmarks) > 0: + if benchmarks: print('Both --all and benchmark(s) specified ' 'for sky bench delete. Letting --all take effect.') @@ -5282,7 +5286,7 @@ def _deploy_local_cluster(gpus: bool): run_command = shlex.split(run_command) # Setup logging paths - run_timestamp = backend_utils.get_run_timestamp() + run_timestamp = sky_logging.get_run_timestamp() log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp, 'local_up.log') tail_cmd = 'tail -n100 -f ' + log_path @@ -5396,7 +5400,7 @@ def _deploy_remote_cluster(ip_file: str, ssh_user: str, ssh_key_path: str, deploy_command = shlex.split(deploy_command) # Setup logging paths - run_timestamp = backend_utils.get_run_timestamp() + run_timestamp = sky_logging.get_run_timestamp() log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp, 'local_up.log') tail_cmd = 'tail -n100 -f ' + log_path @@ -5511,7 +5515,7 @@ def local_down(): run_command = shlex.split(down_script_path) # Setup logging paths - run_timestamp = backend_utils.get_run_timestamp() + run_timestamp = sky_logging.get_run_timestamp() log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp, 'local_down.log') tail_cmd = 'tail -n100 -f ' + log_path diff --git a/sky/cloud_stores.py b/sky/cloud_stores.py index aab848c7932..1922f9d80f0 100644 --- a/sky/cloud_stores.py +++ b/sky/cloud_stores.py @@ -133,7 +133,7 @@ def is_directory(self, url: str) -> bool: # If is a bucket root, then we only need `gsutil` to succeed # to make sure the bucket exists. It is already a directory. _, key = data_utils.split_gcs_path(url) - if len(key) == 0: + if not key: return True # Otherwise, gsutil ls -d url will return: # --> url.rstrip('/') if url is not a directory diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index c42d67f8ba4..cafc789c5be 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -617,7 +617,7 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: 'Failed to fetch the availability zones for the account ' f'{identity_str}. 
It is likely due to permission issues, please' ' check the minimal permission required for AWS: ' - 'https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/aws.html' # pylint: disable= + 'https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/aws.html' # pylint: disable= f'\n{cls._INDENT_PREFIX}Details: ' f'{common_utils.format_exception(e, use_bracket=True)}') return True, hints diff --git a/sky/clouds/cudo.py b/sky/clouds/cudo.py index 145a5d1c26e..25d285da185 100644 --- a/sky/clouds/cudo.py +++ b/sky/clouds/cudo.py @@ -42,8 +42,7 @@ class Cudo(clouds.Cloud): f'{_INDENT_PREFIX} $ cudoctl init\n' f'{_INDENT_PREFIX}For more info: ' # pylint: disable=line-too-long - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html' - ) + 'https://docs.skypilot.co/en/latest/getting-started/installation.html') _PROJECT_HINT = ( 'Create a project and then set it as the default project,:\n' @@ -51,8 +50,7 @@ class Cudo(clouds.Cloud): f'{_INDENT_PREFIX} $ cudoctl init\n' f'{_INDENT_PREFIX}For more info: ' # pylint: disable=line-too-long - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html' - ) + 'https://docs.skypilot.co/en/latest/getting-started/installation.html') _CLOUD_UNSUPPORTED_FEATURES = { clouds.CloudImplementationFeatures.STOP: 'Stopping not supported.', diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 8a28a35505e..ff200f84147 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -167,7 +167,7 @@ class GCP(clouds.Cloud): # ~/.config/gcloud/application_default_credentials.json. f'{_INDENT_PREFIX} $ gcloud auth application-default login\n' f'{_INDENT_PREFIX}For more info: ' - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#google-cloud-platform-gcp' # pylint: disable=line-too-long + 'https://docs.skypilot.co/en/latest/getting-started/installation.html#google-cloud-platform-gcp' # pylint: disable=line-too-long ) _APPLICATION_CREDENTIAL_HINT = ( 'Run the following commands:\n' @@ -175,7 +175,7 @@ class GCP(clouds.Cloud): f'{_INDENT_PREFIX}Or set the environment variable GOOGLE_APPLICATION_CREDENTIALS ' 'to the path of your service account key file.\n' f'{_INDENT_PREFIX}For more info: ' - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#google-cloud-platform-gcp' # pylint: disable=line-too-long + 'https://docs.skypilot.co/en/latest/getting-started/installation.html#google-cloud-platform-gcp' # pylint: disable=line-too-long ) _SUPPORTED_DISK_TIERS = set(resources_utils.DiskTier) @@ -830,13 +830,13 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: ret_permissions = request.execute().get('permissions', []) diffs = set(gcp_minimal_permissions).difference(set(ret_permissions)) - if len(diffs) > 0: + if diffs: identity_str = identity[0] if identity else None return False, ( 'The following permissions are not enabled for the current ' f'GCP identity ({identity_str}):\n ' f'{diffs}\n ' - 'For more details, visit: https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/gcp.html') # pylint: disable=line-too-long + 'For more details, visit: https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html') # pylint: disable=line-too-long return True, None def get_credential_file_mounts(self) -> Dict[str, str]: diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 471639626eb..f9242bd77aa 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -139,7 +139,7 @@ def _existing_allowed_contexts(cls) -> 
List[str]: use the service account mounted in the pod. """ all_contexts = kubernetes_utils.get_all_kube_context_names() - if len(all_contexts) == 0: + if not all_contexts: return [] all_contexts = set(all_contexts) @@ -395,7 +395,7 @@ def make_deploy_resources_variables( tpu_requested = True k8s_resource_key = kubernetes_utils.TPU_RESOURCE_KEY else: - k8s_resource_key = kubernetes_utils.GPU_RESOURCE_KEY + k8s_resource_key = kubernetes_utils.get_gpu_resource_key() port_mode = network_utils.get_port_mode(None) diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index 95f4efe95e3..d4ae6f298d2 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -390,7 +390,7 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: short_credential_help_str = ( 'For more details, refer to: ' # pylint: disable=line-too-long - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#oracle-cloud-infrastructure-oci' + 'https://docs.skypilot.co/en/latest/getting-started/installation.html#oracle-cloud-infrastructure-oci' ) credential_help_str = ( 'To configure credentials, go to: ' diff --git a/sky/clouds/paperspace.py b/sky/clouds/paperspace.py index 69a0d69ca61..dc309d9c9dd 100644 --- a/sky/clouds/paperspace.py +++ b/sky/clouds/paperspace.py @@ -258,7 +258,7 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: return False, ( 'Failed to access Paperspace Cloud with credentials.\n ' 'To configure credentials, follow the instructions at: ' - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#paperspace\n ' + 'https://docs.skypilot.co/en/latest/getting-started/installation.html#paperspace\n ' 'Generate API key and create a json at `~/.paperspace/config.json` with \n ' ' {"apiKey": "[YOUR API KEY]"}\n ' f'Reason: {str(e)}') diff --git a/sky/clouds/runpod.py b/sky/clouds/runpod.py index 6ddbfe0f1e9..b1cc016abd9 100644 --- a/sky/clouds/runpod.py +++ b/sky/clouds/runpod.py @@ -253,7 +253,7 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: ' Credentials can be set up by running: \n' f' $ pip install runpod \n' f' $ runpod config\n' - ' For more information, see https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#runpod' # pylint: disable=line-too-long + ' For more information, see https://docs.skypilot.co/en/latest/getting-started/installation.html#runpod' # pylint: disable=line-too-long ) return True, None diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py index 67c6e09b27e..0fce7c25f6a 100644 --- a/sky/clouds/service_catalog/common.py +++ b/sky/clouds/service_catalog/common.py @@ -270,9 +270,10 @@ def _get_candidate_str(loc: str, all_loc: List[str]) -> str: candidate_loc = difflib.get_close_matches(loc, all_loc, n=5, cutoff=0.9) candidate_loc = sorted(candidate_loc) candidate_strs = '' - if len(candidate_loc) > 0: + if candidate_loc: candidate_strs = ', '.join(candidate_loc) candidate_strs = f'\nDid you mean one of these: {candidate_strs!r}?' 
+ return candidate_strs def _get_all_supported_regions_str() -> str: @@ -286,7 +287,7 @@ def _get_all_supported_regions_str() -> str: filter_df = df if region is not None: filter_df = _filter_region_zone(filter_df, region, zone=None) - if len(filter_df) == 0: + if filter_df.empty: with ux_utils.print_exception_no_traceback(): error_msg = (f'Invalid region {region!r}') candidate_strs = _get_candidate_str( @@ -296,7 +297,7 @@ def _get_all_supported_regions_str() -> str: faq_msg = ( '\nIf a region is not included in the following ' 'list, please check the FAQ docs for how to fetch ' - 'its catalog info.\nhttps://skypilot.readthedocs.io' + 'its catalog info.\nhttps://docs.skypilot.co' '/en/latest/reference/faq.html#advanced-how-to-' 'make-skypilot-use-all-global-regions') error_msg += faq_msg + _get_all_supported_regions_str() @@ -310,7 +311,7 @@ def _get_all_supported_regions_str() -> str: if zone is not None: maybe_region_df = filter_df filter_df = filter_df[filter_df['AvailabilityZone'] == zone] - if len(filter_df) == 0: + if filter_df.empty: region_str = f' for region {region!r}' if region else '' df = maybe_region_df if region else df with ux_utils.print_exception_no_traceback(): @@ -378,7 +379,7 @@ def get_vcpus_mem_from_instance_type_impl( instance_type: str, ) -> Tuple[Optional[float], Optional[float]]: df = _get_instance_type(df, instance_type, None) - if len(df) == 0: + if df.empty: with ux_utils.print_exception_no_traceback(): raise ValueError(f'No instance type {instance_type} found.') assert len(set(df['vCPUs'])) == 1, ('Cannot determine the number of vCPUs ' @@ -484,7 +485,7 @@ def get_accelerators_from_instance_type_impl( instance_type: str, ) -> Optional[Dict[str, Union[int, float]]]: df = _get_instance_type(df, instance_type, None) - if len(df) == 0: + if df.empty: with ux_utils.print_exception_no_traceback(): raise ValueError(f'No instance type {instance_type} found.') row = df.iloc[0] @@ -518,7 +519,7 @@ def get_instance_type_for_accelerator_impl( result = df[(df['AcceleratorName'].str.fullmatch(acc_name, case=False)) & (abs(df['AcceleratorCount'] - acc_count) <= 0.01)] result = _filter_region_zone(result, region, zone) - if len(result) == 0: + if result.empty: fuzzy_result = df[ (df['AcceleratorName'].str.contains(acc_name, case=False)) & (df['AcceleratorCount'] >= acc_count)] @@ -527,7 +528,7 @@ def get_instance_type_for_accelerator_impl( fuzzy_result = fuzzy_result[['AcceleratorName', 'AcceleratorCount']].drop_duplicates() fuzzy_candidate_list = [] - if len(fuzzy_result) > 0: + if not fuzzy_result.empty: for _, row in fuzzy_result.iterrows(): acc_cnt = float(row['AcceleratorCount']) acc_count_display = (int(acc_cnt) if acc_cnt.is_integer() else @@ -539,7 +540,7 @@ def get_instance_type_for_accelerator_impl( result = _filter_with_cpus(result, cpus) result = _filter_with_mem(result, memory) result = _filter_region_zone(result, region, zone) - if len(result) == 0: + if result.empty: return ([], []) # Current strategy: choose the cheapest instance @@ -680,7 +681,7 @@ def get_image_id_from_tag_impl(df: 'pd.DataFrame', tag: str, df = _filter_region_zone(df, region, zone=None) assert len(df) <= 1, ('Multiple images found for tag ' f'{tag} in region {region}') - if len(df) == 0: + if df.empty: return None image_id = df['ImageId'].iloc[0] if pd.isna(image_id): @@ -694,4 +695,4 @@ def is_image_tag_valid_impl(df: 'pd.DataFrame', tag: str, df = df[df['Tag'] == tag] df = _filter_region_zone(df, region, zone=None) df = df.dropna(subset=['ImageId']) - return len(df) > 0 + return not 
df.empty diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index 4aef41f9c90..00768d5c6bb 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -134,7 +134,7 @@ def get_pricing_df(region: Optional[str] = None) -> 'pd.DataFrame': content_str = r.content.decode('ascii') content = json.loads(content_str) items = content.get('Items', []) - if len(items) == 0: + if not items: break all_items += items url = content.get('NextPageLink') diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py b/sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py index 216e8ed9b4f..c08a56955a0 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py @@ -534,7 +534,7 @@ def initialize_images_csv(csv_saving_path: str, vc_object, gpu_name = tag_name.split('-')[1] if gpu_name not in gpu_tags: gpu_tags.append(gpu_name) - if len(gpu_tags) > 0: + if gpu_tags: gpu_tags_str = str(gpu_tags).replace('\'', '\"') f.write(f'{item.id},{vcenter_name},{item_cpu},{item_memory}' f',,,\'{gpu_tags_str}\'\n') diff --git a/sky/clouds/utils/scp_utils.py b/sky/clouds/utils/scp_utils.py index 3e91e22e6d9..4efc79313c5 100644 --- a/sky/clouds/utils/scp_utils.py +++ b/sky/clouds/utils/scp_utils.py @@ -65,7 +65,7 @@ def __setitem__(self, instance_id: str, value: Optional[Dict[str, if value is None: if instance_id in metadata: metadata.pop(instance_id) # del entry - if len(metadata) == 0: + if not metadata: if os.path.exists(self.path): os.remove(self.path) return @@ -84,7 +84,7 @@ def refresh(self, instance_ids: List[str]) -> None: for instance_id in list(metadata.keys()): if instance_id not in instance_ids: del metadata[instance_id] - if len(metadata) == 0: + if not metadata: os.remove(self.path) return with open(self.path, 'w', encoding='utf-8') as f: @@ -410,7 +410,7 @@ def list_security_groups(self, vpc_id=None, sg_name=None): parameter.append('vpcId=' + vpc_id) if sg_name is not None: parameter.append('securityGroupName=' + sg_name) - if len(parameter) > 0: + if parameter: url = url + '?' + '&'.join(parameter) return self._get(url) diff --git a/sky/clouds/vsphere.py b/sky/clouds/vsphere.py index 92e62a8a240..1fd76400c9f 100644 --- a/sky/clouds/vsphere.py +++ b/sky/clouds/vsphere.py @@ -266,7 +266,7 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: 'Run the following commands:' f'\n{cls._INDENT_PREFIX} $ pip install skypilot[vSphere]' f'\n{cls._INDENT_PREFIX}Credentials may also need to be set. ' - 'For more details. See https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#vmware-vsphere' # pylint: disable=line-too-long + 'For more details. See https://docs.skypilot.co/en/latest/getting-started/installation.html#vmware-vsphere' # pylint: disable=line-too-long f'{common_utils.format_exception(e, use_bracket=True)}') required_keys = ['name', 'username', 'password', 'clusters'] diff --git a/sky/core.py b/sky/core.py index 9f1288d7fb6..36b3d45b849 100644 --- a/sky/core.py +++ b/sky/core.py @@ -732,7 +732,7 @@ def cancel( f'{colorama.Fore.YELLOW}' f'Cancelling latest running job on cluster {cluster_name!r}...' f'{colorama.Style.RESET_ALL}') - elif len(job_ids): + elif job_ids: # all = False, len(job_ids) > 0 => cancel the specified jobs. 
jobs_str = ', '.join(map(str, job_ids)) sky_logging.print( @@ -817,7 +817,7 @@ def download_logs( backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend), backend - if job_ids is not None and len(job_ids) == 0: + if job_ids is not None and not job_ids: return {} usage_lib.record_cluster_name_for_current_operation(cluster_name) @@ -866,7 +866,7 @@ def job_status(cluster_name: str, f'of type {backend.__class__.__name__!r}.') assert isinstance(handle, backends.CloudVmRayResourceHandle), handle - if job_ids is not None and len(job_ids) == 0: + if job_ids is not None and not job_ids: return {} sky_logging.print(f'{colorama.Fore.YELLOW}' diff --git a/sky/data/data_utils.py b/sky/data/data_utils.py index bb5bc504770..dc7a9b80a23 100644 --- a/sky/data/data_utils.py +++ b/sky/data/data_utils.py @@ -22,6 +22,7 @@ from sky.adaptors import gcp from sky.adaptors import ibm from sky.skylet import constants +from sky.skylet import log_lib from sky.utils import common_utils from sky.utils import ux_utils @@ -432,6 +433,7 @@ def _group_files_by_dir( def parallel_upload(source_path_list: List[str], filesync_command_generator: Callable[[str, List[str]], str], dirsync_command_generator: Callable[[str, str], str], + log_path: str, bucket_name: str, access_denied_message: str, create_dirs: bool = False, @@ -447,6 +449,7 @@ def parallel_upload(source_path_list: List[str], for a list of files belonging to the same dir. dirsync_command_generator: Callable that generates rsync command for a directory. + log_path: Path to the log file. access_denied_message: Message to intercept from the underlying upload utility when permissions are insufficient. Used in exception handling. @@ -479,7 +482,7 @@ def parallel_upload(source_path_list: List[str], p.starmap( run_upload_cli, zip(commands, [access_denied_message] * len(commands), - [bucket_name] * len(commands))) + [bucket_name] * len(commands), [log_path] * len(commands))) def get_gsutil_command() -> Tuple[str, str]: @@ -520,37 +523,27 @@ def get_gsutil_command() -> Tuple[str, str]: return gsutil_alias, alias_gen -def run_upload_cli(command: str, access_denied_message: str, bucket_name: str): - # TODO(zhwu): Use log_lib.run_with_log() and redirect the output - # to a log file. - with subprocess.Popen(command, - stderr=subprocess.PIPE, - stdout=subprocess.DEVNULL, - shell=True) as process: - stderr = [] - assert process.stderr is not None # for mypy - while True: - line = process.stderr.readline() - if not line: - break - str_line = line.decode('utf-8') - stderr.append(str_line) - if access_denied_message in str_line: - process.kill() - with ux_utils.print_exception_no_traceback(): - raise PermissionError( - 'Failed to upload files to ' - 'the remote bucket. The bucket does not have ' - 'write permissions. It is possible that ' - 'the bucket is public.') - returncode = process.wait() - if returncode != 0: - stderr_str = '\n'.join(stderr) - with ux_utils.print_exception_no_traceback(): - logger.error(stderr_str) - raise exceptions.StorageUploadError( - f'Upload to bucket failed for store {bucket_name}. ' - 'Please check the logs.') +def run_upload_cli(command: str, access_denied_message: str, bucket_name: str, + log_path: str): + returncode, stdout, stderr = log_lib.run_with_log(command, + log_path, + shell=True, + require_outputs=True) + if access_denied_message in stderr: + with ux_utils.print_exception_no_traceback(): + raise PermissionError('Failed to upload files to ' + 'the remote bucket. 
The bucket does not have ' + 'write permissions. It is possible that ' + 'the bucket is public.') + if returncode != 0: + with ux_utils.print_exception_no_traceback(): + logger.error(stderr) + raise exceptions.StorageUploadError( + f'Upload to bucket failed for store {bucket_name}. ' + f'Please check the logs: {log_path}') + if not stdout: + logger.debug('No file uploaded. This could be due to an error or ' + 'because all files already exist on the cloud.') def get_cos_regions() -> List[str]: diff --git a/sky/data/storage.py b/sky/data/storage.py index e87e7b4c939..ca85154743f 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -72,6 +72,8 @@ 'Bucket {bucket_name!r} does not exist. ' 'It may have been deleted externally.') +_STORAGE_LOG_FILE_NAME = 'storage_sync.log' + def get_cached_enabled_storage_clouds_or_refresh( raise_if_no_cloud_access: bool = False) -> List[str]: @@ -1080,7 +1082,7 @@ def add_if_not_none(key: str, value: Optional[Any]): add_if_not_none('source', self.source) stores = None - if len(self.stores) > 0: + if self.stores: stores = ','.join([store.value for store in self.stores]) add_if_not_none('store', stores) add_if_not_none('persistent', self.persistent) @@ -1170,7 +1172,7 @@ def _validate(self): 'Storage \'store: s3\' specified, but ' \ 'AWS access is disabled. To fix, enable '\ 'AWS by running `sky check`. More info: '\ - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html.' # pylint: disable=line-too-long + 'https://docs.skypilot.co/en/latest/getting-started/installation.html.' # pylint: disable=line-too-long ) @classmethod @@ -1344,17 +1346,24 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): else: source_message = source_path_list[0] + log_path = sky_logging.generate_tmp_logging_file_path( + _STORAGE_LOG_FILE_NAME) + sync_path = f'{source_message} -> s3://{self.name}/' with rich_utils.safe_status( - ux_utils.spinner_message(f'Syncing {source_message} -> ' - f's3://{self.name}/')): + ux_utils.spinner_message(f'Syncing {sync_path}', + log_path=log_path)): data_utils.parallel_upload( source_path_list, get_file_sync_command, get_dir_sync_command, + log_path, self.name, self._ACCESS_DENIED_MESSAGE, create_dirs=create_dirs, max_concurrent_uploads=_MAX_CONCURRENT_UPLOADS) + logger.info( + ux_utils.finishing_message(f'Storage synced: {sync_path}', + log_path)) def _transfer_to_s3(self) -> None: assert isinstance(self.source, str), self.source @@ -1612,7 +1621,7 @@ def _validate(self): 'Storage \'store: gcs\' specified, but ' 'GCP access is disabled. To fix, enable ' 'GCP by running `sky check`. 
' - 'More info: https://skypilot.readthedocs.io/en/latest/getting-started/installation.html.') # pylint: disable=line-too-long + 'More info: https://docs.skypilot.co/en/latest/getting-started/installation.html.') # pylint: disable=line-too-long @classmethod def validate_name(cls, name: str) -> str: @@ -1765,13 +1774,19 @@ def batch_gsutil_cp(self, gsutil_alias, alias_gen = data_utils.get_gsutil_command() sync_command = (f'{alias_gen}; echo "{copy_list}" | {gsutil_alias} ' f'cp -e -n -r -I gs://{self.name}') - + log_path = sky_logging.generate_tmp_logging_file_path( + _STORAGE_LOG_FILE_NAME) + sync_path = f'{source_message} -> gs://{self.name}/' with rich_utils.safe_status( - ux_utils.spinner_message(f'Syncing {source_message} -> ' - f'gs://{self.name}/')): + ux_utils.spinner_message(f'Syncing {sync_path}', + log_path=log_path)): data_utils.run_upload_cli(sync_command, self._ACCESS_DENIED_MESSAGE, - bucket_name=self.name) + bucket_name=self.name, + log_path=log_path) + logger.info( + ux_utils.finishing_message(f'Storage synced: {sync_path}', + log_path)) def batch_gsutil_rsync(self, source_path_list: List[Path], @@ -1821,17 +1836,24 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): else: source_message = source_path_list[0] + log_path = sky_logging.generate_tmp_logging_file_path( + _STORAGE_LOG_FILE_NAME) + sync_path = f'{source_message} -> gs://{self.name}/' with rich_utils.safe_status( - ux_utils.spinner_message(f'Syncing {source_message} -> ' - f'gs://{self.name}/')): + ux_utils.spinner_message(f'Syncing {sync_path}', + log_path=log_path)): data_utils.parallel_upload( source_path_list, get_file_sync_command, get_dir_sync_command, + log_path, self.name, self._ACCESS_DENIED_MESSAGE, create_dirs=create_dirs, max_concurrent_uploads=_MAX_CONCURRENT_UPLOADS) + logger.info( + ux_utils.finishing_message(f'Storage synced: {sync_path}', + log_path)) def _transfer_to_gcs(self) -> None: if isinstance(self.source, str) and self.source.startswith('s3://'): @@ -2145,7 +2167,7 @@ def _validate(self): 'Storage "store: azure" specified, but ' 'Azure access is disabled. To fix, enable ' 'Azure by running `sky check`. More info: ' - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html.' # pylint: disable=line-too-long + 'https://docs.skypilot.co/en/latest/getting-started/installation.html.' # pylint: disable=line-too-long ) @classmethod @@ -2570,17 +2592,24 @@ def get_dir_sync_command(src_dir_path, dest_dir_name) -> str: container_endpoint = data_utils.AZURE_CONTAINER_URL.format( storage_account_name=self.storage_account_name, container_name=self.name) + log_path = sky_logging.generate_tmp_logging_file_path( + _STORAGE_LOG_FILE_NAME) + sync_path = f'{source_message} -> {container_endpoint}/' with rich_utils.safe_status( - ux_utils.spinner_message( - f'Syncing {source_message} -> {container_endpoint}/')): + ux_utils.spinner_message(f'Syncing {sync_path}', + log_path=log_path)): data_utils.parallel_upload( source_path_list, get_file_sync_command, get_dir_sync_command, + log_path, self.name, self._ACCESS_DENIED_MESSAGE, create_dirs=create_dirs, max_concurrent_uploads=_MAX_CONCURRENT_UPLOADS) + logger.info( + ux_utils.finishing_message(f'Storage synced: {sync_path}', + log_path)) def _get_bucket(self) -> Tuple[str, bool]: """Obtains the AZ Container. @@ -2861,7 +2890,7 @@ def _validate(self): 'Storage \'store: r2\' specified, but ' \ 'Cloudflare R2 access is disabled. To fix, '\ 'enable Cloudflare R2 by running `sky check`. 
'\ - 'More info: https://skypilot.readthedocs.io/en/latest/getting-started/installation.html.' # pylint: disable=line-too-long + 'More info: https://docs.skypilot.co/en/latest/getting-started/installation.html.' # pylint: disable=line-too-long ) def initialize(self): @@ -2986,17 +3015,24 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): else: source_message = source_path_list[0] + log_path = sky_logging.generate_tmp_logging_file_path( + _STORAGE_LOG_FILE_NAME) + sync_path = f'{source_message} -> r2://{self.name}/' with rich_utils.safe_status( - ux_utils.spinner_message( - f'Syncing {source_message} -> r2://{self.name}/')): + ux_utils.spinner_message(f'Syncing {sync_path}', + log_path=log_path)): data_utils.parallel_upload( source_path_list, get_file_sync_command, get_dir_sync_command, + log_path, self.name, self._ACCESS_DENIED_MESSAGE, create_dirs=create_dirs, max_concurrent_uploads=_MAX_CONCURRENT_UPLOADS) + logger.info( + ux_utils.finishing_message(f'Storage synced: {sync_path}', + log_path)) def _transfer_to_r2(self) -> None: assert isinstance(self.source, str), self.source @@ -3439,17 +3475,24 @@ def get_file_sync_command(base_dir_path, file_names) -> str: else: source_message = source_path_list[0] + log_path = sky_logging.generate_tmp_logging_file_path( + _STORAGE_LOG_FILE_NAME) + sync_path = f'{source_message} -> cos://{self.region}/{self.name}/' with rich_utils.safe_status( - ux_utils.spinner_message(f'Syncing {source_message} -> ' - f'cos://{self.region}/{self.name}/')): + ux_utils.spinner_message(f'Syncing {sync_path}', + log_path=log_path)): data_utils.parallel_upload( source_path_list, get_file_sync_command, get_dir_sync_command, + log_path, self.name, self._ACCESS_DENIED_MESSAGE, create_dirs=create_dirs, max_concurrent_uploads=_MAX_CONCURRENT_UPLOADS) + logger.info( + ux_utils.finishing_message(f'Storage synced: {sync_path}', + log_path)) def _get_bucket(self) -> Tuple[StorageHandle, bool]: """returns IBM COS bucket object if exists, otherwise creates it. diff --git a/sky/jobs/core.py b/sky/jobs/core.py index 9cde3443816..3718d0ac67c 100644 --- a/sky/jobs/core.py +++ b/sky/jobs/core.py @@ -37,12 +37,13 @@ @timeline.event @usage_lib.entrypoint def launch( - task: Union['sky.Task', 'sky.Dag'], - name: Optional[str] = None, - stream_logs: bool = True, - detach_run: bool = False, - retry_until_up: bool = False, - fast: bool = False, + task: Union['sky.Task', 'sky.Dag'], + name: Optional[str] = None, + stream_logs: bool = True, + detach_run: bool = False, + retry_until_up: bool = False, + # TODO(cooperc): remove fast arg before 0.8.0 + fast: bool = True, # pylint: disable=unused-argument for compatibility ) -> None: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Launch a managed job. @@ -54,9 +55,8 @@ def launch( managed job. name: Name of the managed job. detach_run: Whether to detach the run. - fast: Whether to use sky.launch(fast=True) for the jobs controller. If - True, the SkyPilot wheel and the cloud credentials may not be updated - on the jobs controller. + fast: [Deprecated] Does nothing, and will be removed soon. We will + always use fast mode as it's fully safe now. Raises: ValueError: cluster does not exist. Or, the entrypoint is not a valid @@ -149,7 +149,7 @@ def launch( idle_minutes_to_autostop=skylet_constants. 
CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, - fast=fast, + fast=True, _disable_controller_check=True) @@ -347,8 +347,8 @@ def cancel(name: Optional[str] = None, stopped_message='All managed jobs should have finished.') job_id_str = ','.join(map(str, job_ids)) - if sum([len(job_ids) > 0, name is not None, all]) != 1: - argument_str = f'job_ids={job_id_str}' if len(job_ids) > 0 else '' + if sum([bool(job_ids), name is not None, all]) != 1: + argument_str = f'job_ids={job_id_str}' if job_ids else '' argument_str += f' name={name}' if name is not None else '' argument_str += ' all' if all else '' with ux_utils.print_exception_no_traceback(): diff --git a/sky/jobs/state.py b/sky/jobs/state.py index 9a5ab4b3cad..31dcfcfd5eb 100644 --- a/sky/jobs/state.py +++ b/sky/jobs/state.py @@ -591,7 +591,7 @@ def get_latest_task_id_status( If the job_id does not exist, (None, None) will be returned. """ id_statuses = _get_all_task_ids_statuses(job_id) - if len(id_statuses) == 0: + if not id_statuses: return None, None task_id, status = id_statuses[-1] for task_id, status in id_statuses: @@ -617,7 +617,7 @@ def get_failure_reason(job_id: int) -> Optional[str]: WHERE spot_job_id=(?) ORDER BY task_id ASC""", (job_id,)).fetchall() reason = [r[0] for r in reason if r[0] is not None] - if len(reason) == 0: + if not reason: return None return reason[0] diff --git a/sky/jobs/utils.py b/sky/jobs/utils.py index 267c205285b..e5bbced997c 100644 --- a/sky/jobs/utils.py +++ b/sky/jobs/utils.py @@ -234,11 +234,11 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]]) -> str: if job_ids is None: job_ids = managed_job_state.get_nonterminal_job_ids_by_name(None) job_ids = list(set(job_ids)) - if len(job_ids) == 0: + if not job_ids: return 'No job to cancel.' job_id_str = ', '.join(map(str, job_ids)) logger.info(f'Cancelling jobs {job_id_str}.') - cancelled_job_ids = [] + cancelled_job_ids: List[int] = [] for job_id in job_ids: # Check the status of the managed job status. If it is in # terminal state, we can safely skip it. @@ -268,7 +268,7 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]]) -> str: shutil.copy(str(signal_file), str(legacy_signal_file)) cancelled_job_ids.append(job_id) - if len(cancelled_job_ids) == 0: + if not cancelled_job_ids: return 'No job to cancel.' identity_str = f'Job with ID {cancelled_job_ids[0]} is' if len(cancelled_job_ids) > 1: @@ -281,7 +281,7 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]]) -> str: def cancel_job_by_name(job_name: str) -> str: """Cancel a job by name.""" job_ids = managed_job_state.get_nonterminal_job_ids_by_name(job_name) - if len(job_ids) == 0: + if not job_ids: return f'No running job found with name {job_name!r}.' if len(job_ids) > 1: return (f'{colorama.Fore.RED}Multiple running jobs found ' @@ -515,7 +515,7 @@ def stream_logs(job_id: Optional[int], for job in managed_jobs if job['job_name'] == job_name } - if len(managed_job_ids) == 0: + if not managed_job_ids: return f'No managed job found with name {job_name!r}.' if len(managed_job_ids) > 1: job_ids_str = ', '.join( @@ -541,7 +541,7 @@ def stream_logs(job_id: Optional[int], if job_id is None: assert job_name is not None job_ids = managed_job_state.get_nonterminal_job_ids_by_name(job_name) - if len(job_ids) == 0: + if not job_ids: return f'No running managed job found with name {job_name!r}.' 
if len(job_ids) > 1: raise ValueError( diff --git a/sky/optimizer.py b/sky/optimizer.py index 2f70dd39429..c5a631c213b 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -188,7 +188,7 @@ def _remove_dummy_source_sink_nodes(dag: 'dag_lib.Dag'): """Removes special Source and Sink nodes.""" source = [t for t in dag.tasks if t.name == _DUMMY_SOURCE_NAME] sink = [t for t in dag.tasks if t.name == _DUMMY_SINK_NAME] - if len(source) == len(sink) == 0: + if not source and not sink: return assert len(source) == len(sink) == 1, dag.tasks dag.remove(source[0]) @@ -1298,7 +1298,7 @@ def _fill_in_launchable_resources( resources, num_nodes=task.num_nodes) if feasible_resources.hint is not None: hints[cloud] = feasible_resources.hint - if len(feasible_resources.resources_list) > 0: + if feasible_resources.resources_list: # Assume feasible_resources is sorted by prices. Guaranteed by # the implementation of get_feasible_launchable_resources and # the underlying service_catalog filtering @@ -1310,7 +1310,7 @@ def _fill_in_launchable_resources( else: all_fuzzy_candidates.update( feasible_resources.fuzzy_candidate_list) - if len(launchable[resources]) == 0: + if not launchable[resources]: clouds_str = str(clouds_list) if len(clouds_list) > 1 else str( clouds_list[0]) num_node_str = '' diff --git a/sky/provision/aws/config.py b/sky/provision/aws/config.py index 6a8c77eafed..ffa87c3a011 100644 --- a/sky/provision/aws/config.py +++ b/sky/provision/aws/config.py @@ -279,7 +279,7 @@ def _has_igw_route(route_tables): logger.debug(f'subnet {subnet_id} route tables: {route_tables}') if _has_igw_route(route_tables): return True - if len(route_tables) > 0: + if route_tables: return False # Handle the case that a "main" route table is implicitly associated with @@ -454,7 +454,7 @@ def _vpc_id_from_security_group_ids(ec2, sg_ids: List[str]) -> Any: no_sg_msg = ('Failed to detect a security group with id equal to any of ' 'the configured SecurityGroupIds.') - assert len(vpc_ids) > 0, no_sg_msg + assert vpc_ids, no_sg_msg return vpc_ids[0] diff --git a/sky/provision/gcp/config.py b/sky/provision/gcp/config.py index a8292669a7c..a99267eb0b9 100644 --- a/sky/provision/gcp/config.py +++ b/sky/provision/gcp/config.py @@ -397,7 +397,7 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str, operation = compute.networks().getEffectiveFirewalls(project=project_id, network=vpc_name) response = operation.execute() - if len(response) == 0: + if not response: return False effective_rules = response['firewalls'] @@ -515,7 +515,7 @@ def _create_rules(project_id: str, compute, rules, vpc_name): rule_list = _list_firewall_rules(project_id, compute, filter=f'(name={rule_name})') - if len(rule_list) > 0: + if rule_list: _delete_firewall_rule(project_id, compute, rule_name) body = rule.copy() @@ -624,7 +624,7 @@ def get_usable_vpc_and_subnet( vpc_list = _list_vpcnets(project_id, compute, filter=f'name={constants.SKYPILOT_VPC_NAME}') - if len(vpc_list) == 0: + if not vpc_list: body = constants.VPC_TEMPLATE.copy() body['name'] = body['name'].format(VPC_NAME=constants.SKYPILOT_VPC_NAME) body['selfLink'] = body['selfLink'].format( diff --git a/sky/provision/gcp/constants.py b/sky/provision/gcp/constants.py index 4f442709b0c..7b3fd4046b5 100644 --- a/sky/provision/gcp/constants.py +++ b/sky/provision/gcp/constants.py @@ -142,7 +142,7 @@ ] # A list of permissions required to run SkyPilot on GCP. 
-# Keep this in sync with https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/gcp.html # pylint: disable=line-too-long +# Keep this in sync with https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html # pylint: disable=line-too-long VM_MINIMAL_PERMISSIONS = [ 'compute.disks.create', 'compute.disks.list', diff --git a/sky/provision/kubernetes/config.py b/sky/provision/kubernetes/config.py index 370430720f0..0fe920be9d6 100644 --- a/sky/provision/kubernetes/config.py +++ b/sky/provision/kubernetes/config.py @@ -232,7 +232,7 @@ def _get_resource(container_resources: Dict[str, Any], resource_name: str, # Look for keys containing the resource_name. For example, # the key 'nvidia.com/gpu' contains the key 'gpu'. matching_keys = [key for key in resources if resource_name in key.lower()] - if len(matching_keys) == 0: + if not matching_keys: return float('inf') if len(matching_keys) > 1: # Should have only one match -- mostly relevant for gpu. @@ -265,7 +265,7 @@ def _configure_autoscaler_service_account( field_selector = f'metadata.name={name}' accounts = (kubernetes.core_api(context).list_namespaced_service_account( namespace, field_selector=field_selector).items) - if len(accounts) > 0: + if accounts: assert len(accounts) == 1 # Nothing to check for equality and patch here, # since the service_account.metadata.name is the only important @@ -308,7 +308,7 @@ def _configure_autoscaler_role(namespace: str, context: Optional[str], field_selector = f'metadata.name={name}' roles = (kubernetes.auth_api(context).list_namespaced_role( namespace, field_selector=field_selector).items) - if len(roles) > 0: + if roles: assert len(roles) == 1 existing_role = roles[0] # Convert to k8s object to compare @@ -374,7 +374,7 @@ def _configure_autoscaler_role_binding( field_selector = f'metadata.name={name}' role_bindings = (kubernetes.auth_api(context).list_namespaced_role_binding( rb_namespace, field_selector=field_selector).items) - if len(role_bindings) > 0: + if role_bindings: assert len(role_bindings) == 1 existing_binding = role_bindings[0] new_rb = kubernetes_utils.dict_to_k8s_object(binding, 'V1RoleBinding') @@ -415,7 +415,7 @@ def _configure_autoscaler_cluster_role(namespace, context, field_selector = f'metadata.name={name}' cluster_roles = (kubernetes.auth_api(context).list_cluster_role( field_selector=field_selector).items) - if len(cluster_roles) > 0: + if cluster_roles: assert len(cluster_roles) == 1 existing_cr = cluster_roles[0] new_cr = kubernetes_utils.dict_to_k8s_object(role, 'V1ClusterRole') @@ -460,7 +460,7 @@ def _configure_autoscaler_cluster_role_binding( field_selector = f'metadata.name={name}' cr_bindings = (kubernetes.auth_api(context).list_cluster_role_binding( field_selector=field_selector).items) - if len(cr_bindings) > 0: + if cr_bindings: assert len(cr_bindings) == 1 existing_binding = cr_bindings[0] new_binding = kubernetes_utils.dict_to_k8s_object( @@ -639,7 +639,7 @@ def _configure_services(namespace: str, context: Optional[str], field_selector = f'metadata.name={name}' services = (kubernetes.core_api(context).list_namespaced_service( namespace, field_selector=field_selector).items) - if len(services) > 0: + if services: assert len(services) == 1 existing_service = services[0] # Convert to k8s object to compare diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 11e3d2d80ad..c431b023ab9 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -180,6 +180,7 @@ def 
_raise_pod_scheduling_errors(namespace, context, new_nodes): # case we will need to update this logic. # TODO(Doyoung): Update the error message raised # with the multi-host TPU support. + gpu_resource_key = kubernetes_utils.get_gpu_resource_key() # pylint: disable=line-too-long if 'Insufficient google.com/tpu' in event_message: extra_msg = ( f'Verify if ' @@ -192,14 +193,15 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes): pod, extra_msg, details=event_message)) - elif (('Insufficient nvidia.com/gpu' + elif ((f'Insufficient {gpu_resource_key}' in event_message) or ('didn\'t match Pod\'s node affinity/selector' in event_message)): extra_msg = ( - f'Verify if ' - f'{pod.spec.node_selector[label_key]}' - ' is available in the cluster.') + f'Verify if any node matching label ' + f'{pod.spec.node_selector[label_key]} and ' + f'sufficient resource {gpu_resource_key} ' + f'is available in the cluster.') raise config_lib.KubernetesError( _lack_resource_msg('GPU', pod, @@ -722,13 +724,13 @@ def _create_pods(region: str, cluster_name_on_cloud: str, 'Continuing without using nvidia RuntimeClass.\n' 'If you are on a K3s cluster, manually ' 'override runtimeClassName in ~/.sky/config.yaml. ' - 'For more details, refer to https://skypilot.readthedocs.io/en/latest/reference/config.html') # pylint: disable=line-too-long + 'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html') # pylint: disable=line-too-long needs_gpus = False limits = pod_spec['spec']['containers'][0].get('resources', {}).get('limits') if limits is not None: - needs_gpus = limits.get(kubernetes_utils.GPU_RESOURCE_KEY, 0) > 0 + needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(), 0) > 0 # TPU pods provisioned on GKE use the default containerd runtime. # Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview # pylint: disable=line-too-long diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index b16482e5072..29fcf181edd 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -230,7 +230,7 @@ def get_ingress_external_ip_and_ports( namespace, _request_timeout=kubernetes.API_TIMEOUT).items if item.metadata.name == 'ingress-nginx-controller' ] - if len(ingress_services) == 0: + if not ingress_services: return (None, None) ingress_service = ingress_services[0] diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 7442c9be7a6..87ccd6b105d 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -438,7 +438,7 @@ def detect_accelerator_resource( nodes = get_kubernetes_nodes(context) for node in nodes: cluster_resources.update(node.status.allocatable.keys()) - has_accelerator = (GPU_RESOURCE_KEY in cluster_resources or + has_accelerator = (get_gpu_resource_key() in cluster_resources or TPU_RESOURCE_KEY in cluster_resources) return has_accelerator, cluster_resources @@ -583,7 +583,7 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType', node for node in nodes if gpu_label_key in node.metadata.labels and node.metadata.labels[gpu_label_key] == gpu_label_val ] - assert len(gpu_nodes) > 0, 'GPU nodes not found' + assert gpu_nodes, 'GPU nodes not found' if is_tpu_on_gke(acc_type): # If requested accelerator is a TPU type, check if the cluster # has sufficient TPU resource to meet the requirement. 
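The Kubernetes hunks above and the new helper further down make the GPU resource name configurable instead of hard-coding nvidia.com/gpu. A minimal sketch of how that override is meant to behave, assuming the default key constant is 'nvidia.com/gpu'; the node_gpu_count helper is illustrative (not part of the patch) and mirrors the lookup done in get_node_accelerator_count():

import os

GPU_RESOURCE_KEY = 'nvidia.com/gpu'  # default key when no override is set


def get_gpu_resource_key() -> str:
    # Operators can point SkyPilot at a different device-plugin resource,
    # e.g. CUSTOM_GPU_RESOURCE_KEY=amd.com/gpu or nvidia.com/gpu-h100.
    return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=GPU_RESOURCE_KEY)


def node_gpu_count(allocatable: dict) -> int:
    # Illustrative helper: read the accelerator count a node publishes
    # under whichever GPU resource key is currently in effect.
    return int(allocatable.get(get_gpu_resource_key(), 0))


# Example: a node advertising 8 GPUs under the default key.
print(node_gpu_count({'cpu': '64', 'nvidia.com/gpu': '8'}))  # -> 8 when no override is set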
@@ -972,7 +972,7 @@ def is_kubeconfig_exec_auth( '~/.sky/config.yaml:\n' ' kubernetes:\n' ' remote_identity: SERVICE_ACCOUNT\n' - ' More: https://skypilot.readthedocs.io/en/latest/' + ' More: https://docs.skypilot.co/en/latest/' 'reference/config.html') return True, exec_msg return False, None @@ -1234,7 +1234,8 @@ def construct_ssh_jump_command( '-o StrictHostKeyChecking=no ' '-o UserKnownHostsFile=/dev/null ' f'-o IdentitiesOnly=yes ' - f'-W %h:%p {ssh_jump_user}@{ssh_jump_ip}') + r'-W \[%h\]:%p ' + f'{ssh_jump_user}@{ssh_jump_ip}') if ssh_jump_port is not None: ssh_jump_proxy_command += f' -p {ssh_jump_port} ' if proxy_cmd_path is not None: @@ -1525,7 +1526,7 @@ def clean_zombie_ssh_jump_pod(namespace: str, context: Optional[str], def find(l, predicate): """Utility function to find element in given list""" results = [x for x in l if predicate(x)] - return results[0] if len(results) > 0 else None + return results[0] if results else None # Get the SSH jump pod name from the head pod try: @@ -2253,10 +2254,11 @@ def get_node_accelerator_count(attribute_dict: dict) -> int: Number of accelerators allocated or available from the node. If no resource is found, it returns 0. """ - assert not (GPU_RESOURCE_KEY in attribute_dict and + gpu_resource_name = get_gpu_resource_key() + assert not (gpu_resource_name in attribute_dict and TPU_RESOURCE_KEY in attribute_dict) - if GPU_RESOURCE_KEY in attribute_dict: - return int(attribute_dict[GPU_RESOURCE_KEY]) + if gpu_resource_name in attribute_dict: + return int(attribute_dict[gpu_resource_name]) elif TPU_RESOURCE_KEY in attribute_dict: return int(attribute_dict[TPU_RESOURCE_KEY]) return 0 @@ -2415,3 +2417,18 @@ def process_skypilot_pods( num_pods = len(cluster.pods) cluster.resources_str = f'{num_pods}x {cluster.resources}' return list(clusters.values()), jobs_controllers, serve_controllers + + +def get_gpu_resource_key(): + """Get the GPU resource name to use in kubernetes. + The function first checks for an environment variable. + If defined, it uses its value; otherwise, it returns the default value. + Args: + name (str): Default GPU resource name, default is "nvidia.com/gpu". + Returns: + str: The selected GPU resource name. + """ + # Retrieve GPU resource name from environment variable, if set. + # Else use default. + # E.g., can be nvidia.com/gpu-h100, amd.com/gpu etc. 
+ return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=GPU_RESOURCE_KEY) diff --git a/sky/provision/lambda_cloud/lambda_utils.py b/sky/provision/lambda_cloud/lambda_utils.py index 4d8e6246b6d..cfd8e02ad23 100644 --- a/sky/provision/lambda_cloud/lambda_utils.py +++ b/sky/provision/lambda_cloud/lambda_utils.py @@ -50,7 +50,7 @@ def set(self, instance_id: str, value: Optional[Dict[str, Any]]) -> None: if value is None: if instance_id in metadata: metadata.pop(instance_id) # del entry - if len(metadata) == 0: + if not metadata: if os.path.exists(self.path): os.remove(self.path) return @@ -69,7 +69,7 @@ def refresh(self, instance_ids: List[str]) -> None: for instance_id in list(metadata.keys()): if instance_id not in instance_ids: del metadata[instance_id] - if len(metadata) == 0: + if not metadata: os.remove(self.path) return with open(self.path, 'w', encoding='utf-8') as f: @@ -150,7 +150,7 @@ def create_instances( ['regions_with_capacity_available']) available_regions = [reg['name'] for reg in available_regions] if region not in available_regions: - if len(available_regions) > 0: + if available_regions: aval_reg = ' '.join(available_regions) else: aval_reg = 'None' diff --git a/sky/provision/oci/query_utils.py b/sky/provision/oci/query_utils.py index 47a0438cb21..8cca0629305 100644 --- a/sky/provision/oci/query_utils.py +++ b/sky/provision/oci/query_utils.py @@ -248,7 +248,7 @@ def find_compartment(cls, region) -> str: limit=1) compartments = list_compartments_response.data - if len(compartments) > 0: + if compartments: skypilot_compartment = compartments[0].id return skypilot_compartment @@ -274,7 +274,7 @@ def find_create_vcn_subnet(cls, region) -> Optional[str]: display_name=oci_utils.oci_config.VCN_NAME, lifecycle_state='AVAILABLE') vcns = list_vcns_response.data - if len(vcns) > 0: + if vcns: # Found the VCN. skypilot_vcn = vcns[0].id list_subnets_response = net_client.list_subnets( @@ -359,7 +359,7 @@ def create_vcn_subnet(cls, net_client, if str(s.cidr_block).startswith('all-') and str(s.cidr_block). endswith('-services-in-oracle-services-network') ] - if len(services) > 0: + if services: # Create service gateway for regional services. create_sg_response = net_client.create_service_gateway( create_service_gateway_details=oci_adaptor.oci.core.models. 
diff --git a/sky/provision/vsphere/common/vim_utils.py b/sky/provision/vsphere/common/vim_utils.py index 33c02db8feb..bde1bc25cf0 100644 --- a/sky/provision/vsphere/common/vim_utils.py +++ b/sky/provision/vsphere/common/vim_utils.py @@ -56,7 +56,7 @@ def get_hosts_by_cluster_names(content, vcenter_name, cluster_name_dicts=None): 'name': cluster.name } for cluster in cluster_view.view] cluster_view.Destroy() - if len(cluster_name_dicts) == 0: + if not cluster_name_dicts: logger.warning(f'vCenter \'{vcenter_name}\' has no clusters') # Retrieve all cluster names from the cluster_name_dicts diff --git a/sky/provision/vsphere/instance.py b/sky/provision/vsphere/instance.py index 787d8c97f62..2075cdb9c36 100644 --- a/sky/provision/vsphere/instance.py +++ b/sky/provision/vsphere/instance.py @@ -162,7 +162,7 @@ def _create_instances( if not gpu_instance: # Find an image for CPU images_df = images_df[images_df['GpuTags'] == '\'[]\''] - if len(images_df) == 0: + if not images_df: logger.error( f'Can not find an image for instance type: {instance_type}.') raise Exception( @@ -185,7 +185,7 @@ def _create_instances( image_instance_mapping_df = image_instance_mapping_df[ image_instance_mapping_df['InstanceType'] == instance_type] - if len(image_instance_mapping_df) == 0: + if not image_instance_mapping_df: raise Exception(f"""There is no image can match instance type named {instance_type} If you are using CPU-only instance, assign an image with tag @@ -218,10 +218,9 @@ def _create_instances( hosts_df = hosts_df[(hosts_df['AvailableCPUs'] / hosts_df['cpuMhz']) >= cpus_needed] hosts_df = hosts_df[hosts_df['AvailableMemory(MB)'] >= memory_needed] - assert len(hosts_df) > 0, ( - f'There is no host available to create the instance ' - f'{vms_item["InstanceType"]}, at least {cpus_needed} ' - f'cpus and {memory_needed}MB memory are required.') + assert hosts_df, (f'There is no host available to create the instance ' + f'{vms_item["InstanceType"]}, at least {cpus_needed} ' + f'cpus and {memory_needed}MB memory are required.') # Sort the hosts df by AvailableCPUs to get the compatible host with the # least resource @@ -365,7 +364,7 @@ def _choose_vsphere_cluster_name(config: common.ProvisionConfig, region: str, skypilot framework-optimized availability_zones""" vsphere_cluster_name = None vsphere_cluster_name_str = config.provider_config['availability_zone'] - if len(vc_object.clusters) > 0: + if vc_object.clusters: for optimized_cluster_name in vsphere_cluster_name_str.split(','): if optimized_cluster_name in [ item['name'] for item in vc_object.clusters diff --git a/sky/provision/vsphere/vsphere_utils.py b/sky/provision/vsphere/vsphere_utils.py index faec5d54930..51f284b0fc6 100644 --- a/sky/provision/vsphere/vsphere_utils.py +++ b/sky/provision/vsphere/vsphere_utils.py @@ -257,7 +257,7 @@ def get_skypilot_profile_id(self): # hard code here. should support configure later. 
profile_name = 'skypilot_policy' storage_profile_id = None - if len(profile_ids) > 0: + if profile_ids: profiles = pm.PbmRetrieveContent(profileIds=profile_ids) for profile in profiles: if profile_name in profile.name: diff --git a/sky/resources.py b/sky/resources.py index 5184278e02e..68d1b6f9ea8 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -661,7 +661,7 @@ def _validate_and_set_region_zone(self, region: Optional[str], continue valid_clouds.append(cloud) - if len(valid_clouds) == 0: + if not valid_clouds: if len(enabled_clouds) == 1: cloud_str = f'for cloud {enabled_clouds[0]}' else: @@ -773,7 +773,7 @@ def _try_validate_instance_type(self) -> None: for cloud in enabled_clouds: if cloud.instance_type_exists(self._instance_type): valid_clouds.append(cloud) - if len(valid_clouds) == 0: + if not valid_clouds: if len(enabled_clouds) == 1: cloud_str = f'for cloud {enabled_clouds[0]}' else: @@ -1008,7 +1008,7 @@ def _try_validate_labels(self) -> None: f'Label rejected due to {cloud}: {err_msg}' ]) break - if len(invalid_table.rows) > 0: + if invalid_table.rows: with ux_utils.print_exception_no_traceback(): raise ValueError( 'The following labels are invalid:' @@ -1283,7 +1283,7 @@ def copy(self, **override) -> 'Resources': _cluster_config_overrides=override.pop( '_cluster_config_overrides', self._cluster_config_overrides), ) - assert len(override) == 0 + assert not override return resources def valid_on_region_zones(self, region: str, zones: List[str]) -> bool: diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index a4278f192fb..7a6311ad535 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -320,8 +320,8 @@ def select_outdated_replicas_to_scale_down( """Select outdated replicas to scale down.""" if self.update_mode == serve_utils.UpdateMode.ROLLING: - latest_ready_replicas = [] - old_nonterminal_replicas = [] + latest_ready_replicas: List['replica_managers.ReplicaInfo'] = [] + old_nonterminal_replicas: List['replica_managers.ReplicaInfo'] = [] for info in replica_infos: if info.version == self.latest_version: if info.is_ready: diff --git a/sky/serve/core.py b/sky/serve/core.py index f6f6c53ad7b..f71c60b2fef 100644 --- a/sky/serve/core.py +++ b/sky/serve/core.py @@ -360,7 +360,7 @@ def update( raise RuntimeError(e.error_msg) from e service_statuses = serve_utils.load_service_status(serve_status_payload) - if len(service_statuses) == 0: + if not service_statuses: with ux_utils.print_exception_no_traceback(): raise RuntimeError(f'Cannot find service {service_name!r}.' f'To spin up a service, use {ux_utils.BOLD}' @@ -384,6 +384,17 @@ def update( with ux_utils.print_exception_no_traceback(): raise RuntimeError(prompt) + original_lb_policy = service_record['load_balancing_policy'] + assert task.service is not None, 'Service section not found.' + if original_lb_policy != task.service.load_balancing_policy: + logger.warning( + f'{colorama.Fore.YELLOW}Current load balancing policy ' + f'{original_lb_policy!r} is different from the new policy ' + f'{task.service.load_balancing_policy!r}. Updating the load ' + 'balancing policy is not supported yet and it will be ignored. 
' + 'The service will continue to use the current load balancing ' + f'policy.{colorama.Style.RESET_ALL}') + with rich_utils.safe_status( ux_utils.spinner_message('Initializing service')): controller_utils.maybe_translate_local_file_mounts_and_sync_up( @@ -480,9 +491,9 @@ def down( stopped_message='All services should have terminated.') service_names_str = ','.join(service_names) - if sum([len(service_names) > 0, all]) != 1: - argument_str = f'service_names={service_names_str}' if len( - service_names) > 0 else '' + if sum([bool(service_names), all]) != 1: + argument_str = (f'service_names={service_names_str}' + if service_names else '') argument_str += ' all' if all else '' raise ValueError('Can only specify one of service_names or all. ' f'Provided {argument_str!r}.') @@ -581,9 +592,10 @@ def status( 'status': (sky.ServiceStatus) service status, 'controller_port': (Optional[int]) controller port, 'load_balancer_port': (Optional[int]) load balancer port, - 'policy': (Optional[str]) load balancer policy description, + 'policy': (Optional[str]) autoscaling policy description, 'requested_resources_str': (str) str representation of requested resources, + 'load_balancing_policy': (str) load balancing policy name, 'replica_info': (List[Dict[str, Any]]) replica information, } diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index 30697532a22..6b4621569d6 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -45,6 +45,8 @@ def __init__(self, # Use the registry to create the load balancing policy self._load_balancing_policy = lb_policies.LoadBalancingPolicy.make( load_balancing_policy_name) + logger.info('Starting load balancer with policy ' + f'{load_balancing_policy_name}.') self._request_aggregator: serve_utils.RequestsAggregator = ( serve_utils.RequestTimestamp()) # TODO(tian): httpx.Client has a resource limit of 100 max connections @@ -128,6 +130,7 @@ async def _proxy_request_to( encountered if anything goes wrong. """ logger.info(f'Proxy request to {url}') + self._load_balancing_policy.pre_execute_hook(url, request) try: # We defer the get of the client here on purpose, for case when the # replica is ready in `_proxy_with_retries` but refreshed before @@ -147,11 +150,16 @@ async def _proxy_request_to( content=await request.body(), timeout=constants.LB_STREAM_TIMEOUT) proxy_response = await client.send(proxy_request, stream=True) + + async def background_func(): + await proxy_response.aclose() + self._load_balancing_policy.post_execute_hook(url, request) + return fastapi.responses.StreamingResponse( content=proxy_response.aiter_raw(), status_code=proxy_response.status_code, headers=proxy_response.headers, - background=background.BackgroundTask(proxy_response.aclose)) + background=background.BackgroundTask(background_func)) except (httpx.RequestError, httpx.HTTPStatusError) as e: logger.error(f'Error when proxy request to {url}: ' f'{common_utils.format_exception(e)}') @@ -263,7 +271,7 @@ def run_load_balancer(controller_addr: str, parser.add_argument( '--load-balancing-policy', choices=available_policies, - default='round_robin', + default=lb_policies.DEFAULT_LB_POLICY, help=f'The load balancing policy to use. 
Available policies: ' f'{", ".join(available_policies)}.') args = parser.parse_args() diff --git a/sky/serve/load_balancing_policies.py b/sky/serve/load_balancing_policies.py index aec6eb01487..4ad69f78943 100644 --- a/sky/serve/load_balancing_policies.py +++ b/sky/serve/load_balancing_policies.py @@ -1,7 +1,9 @@ """LoadBalancingPolicy: Policy to select endpoint.""" +import collections import random +import threading import typing -from typing import List, Optional +from typing import Dict, List, Optional from sky import sky_logging @@ -13,6 +15,10 @@ # Define a registry for load balancing policies LB_POLICIES = {} DEFAULT_LB_POLICY = None +# Prior to #4439, the default policy was round_robin. We store the legacy +# default policy here to maintain backwards compatibility. Remove this after +# 2 minor release, i.e., 0.9.0. +LEGACY_DEFAULT_POLICY = 'round_robin' def _request_repr(request: 'fastapi.Request') -> str: @@ -38,11 +44,17 @@ def __init_subclass__(cls, name: str, default: bool = False): DEFAULT_LB_POLICY = name @classmethod - def make(cls, policy_name: Optional[str] = None) -> 'LoadBalancingPolicy': - """Create a load balancing policy from a name.""" + def make_policy_name(cls, policy_name: Optional[str]) -> str: + """Return the policy name.""" + assert DEFAULT_LB_POLICY is not None, 'No default policy set.' if policy_name is None: - policy_name = DEFAULT_LB_POLICY + return DEFAULT_LB_POLICY + return policy_name + @classmethod + def make(cls, policy_name: Optional[str] = None) -> 'LoadBalancingPolicy': + """Create a load balancing policy from a name.""" + policy_name = cls.make_policy_name(policy_name) if policy_name not in LB_POLICIES: raise ValueError(f'Unknown load balancing policy: {policy_name}') return LB_POLICIES[policy_name]() @@ -65,8 +77,16 @@ def select_replica(self, request: 'fastapi.Request') -> Optional[str]: def _select_replica(self, request: 'fastapi.Request') -> Optional[str]: raise NotImplementedError + def pre_execute_hook(self, replica_url: str, + request: 'fastapi.Request') -> None: + pass + + def post_execute_hook(self, replica_url: str, + request: 'fastapi.Request') -> None: + pass + -class RoundRobinPolicy(LoadBalancingPolicy, name='round_robin', default=True): +class RoundRobinPolicy(LoadBalancingPolicy, name='round_robin'): """Round-robin load balancing policy.""" def __init__(self) -> None: @@ -90,3 +110,43 @@ def _select_replica(self, request: 'fastapi.Request') -> Optional[str]: ready_replica_url = self.ready_replicas[self.index] self.index = (self.index + 1) % len(self.ready_replicas) return ready_replica_url + + +class LeastLoadPolicy(LoadBalancingPolicy, name='least_load', default=True): + """Least load load balancing policy.""" + + def __init__(self) -> None: + super().__init__() + self.load_map: Dict[str, int] = collections.defaultdict(int) + self.lock = threading.Lock() + + def set_ready_replicas(self, ready_replicas: List[str]) -> None: + if set(self.ready_replicas) == set(ready_replicas): + return + with self.lock: + self.ready_replicas = ready_replicas + for r in self.ready_replicas: + if r not in ready_replicas: + del self.load_map[r] + for replica in ready_replicas: + self.load_map[replica] = self.load_map.get(replica, 0) + + def _select_replica(self, request: 'fastapi.Request') -> Optional[str]: + del request # Unused. 
+ if not self.ready_replicas: + return None + with self.lock: + return min(self.ready_replicas, + key=lambda replica: self.load_map.get(replica, 0)) + + def pre_execute_hook(self, replica_url: str, + request: 'fastapi.Request') -> None: + del request # Unused. + with self.lock: + self.load_map[replica_url] += 1 + + def post_execute_hook(self, replica_url: str, + request: 'fastapi.Request') -> None: + del request # Unused. + with self.lock: + self.load_map[replica_url] -= 1 diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index c0e5220e779..5f92dda0e2f 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -172,7 +172,7 @@ def _get_resources_ports(task_yaml: str) -> str: """Get the resources ports used by the task.""" task = sky.Task.from_yaml(task_yaml) # Already checked all ports are the same in sky.serve.core.up - assert len(task.resources) >= 1, task + assert task.resources, task task_resources: 'resources.Resources' = list(task.resources)[0] # Already checked the resources have and only have one port # before upload the task yaml. diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index 333e0138fb4..f3e8fbf1e53 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -11,6 +11,7 @@ import colorama from sky.serve import constants +from sky.serve import load_balancing_policies as lb_policies from sky.utils import db_utils if typing.TYPE_CHECKING: @@ -76,6 +77,8 @@ def create_table(cursor: 'sqlite3.Cursor', conn: 'sqlite3.Connection') -> None: db_utils.add_column_to_table(_DB.cursor, _DB.conn, 'services', 'active_versions', f'TEXT DEFAULT {json.dumps([])!r}') +db_utils.add_column_to_table(_DB.cursor, _DB.conn, 'services', + 'load_balancing_policy', 'TEXT DEFAULT NULL') _UNIQUE_CONSTRAINT_FAILED_ERROR_MSG = 'UNIQUE constraint failed: services.name' @@ -223,7 +226,7 @@ def from_replica_statuses( for status in ReplicaStatus.failed_statuses()) > 0: return cls.FAILED # When min_replicas = 0, there is no (provisioning) replica. - if len(replica_statuses) == 0: + if not replica_statuses: return cls.NO_REPLICA return cls.REPLICA_INIT @@ -241,7 +244,8 @@ def from_replica_statuses( def add_service(name: str, controller_job_id: int, policy: str, - requested_resources_str: str, status: ServiceStatus) -> bool: + requested_resources_str: str, load_balancing_policy: str, + status: ServiceStatus) -> bool: """Add a service in the database. Returns: @@ -254,10 +258,10 @@ def add_service(name: str, controller_job_id: int, policy: str, """\ INSERT INTO services (name, controller_job_id, status, policy, - requested_resources_str) - VALUES (?, ?, ?, ?, ?)""", + requested_resources_str, load_balancing_policy) + VALUES (?, ?, ?, ?, ?, ?)""", (name, controller_job_id, status.value, policy, - requested_resources_str)) + requested_resources_str, load_balancing_policy)) except sqlite3.IntegrityError as e: if str(e) != _UNIQUE_CONSTRAINT_FAILED_ERROR_MSG: @@ -324,7 +328,12 @@ def set_service_load_balancer_port(service_name: str, def _get_service_from_row(row) -> Dict[str, Any]: (current_version, name, controller_job_id, controller_port, load_balancer_port, status, uptime, policy, _, _, requested_resources_str, - _, active_versions) = row[:13] + _, active_versions, load_balancing_policy) = row[:14] + if load_balancing_policy is None: + # This entry in database was added in #4439, and it will always be set + # to a str value. If it is None, it means it is an legacy entry and is + # using the legacy default policy. 
+ load_balancing_policy = lb_policies.LEGACY_DEFAULT_POLICY return { 'name': name, 'controller_job_id': controller_job_id, @@ -341,6 +350,7 @@ def _get_service_from_row(row) -> Dict[str, Any]: # integers in json format. This is mainly for display purpose. 'active_versions': json.loads(active_versions), 'requested_resources_str': requested_resources_str, + 'load_balancing_policy': load_balancing_policy, } diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 6ab932f278a..35d2c25ff40 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -110,7 +110,7 @@ class UpdateMode(enum.Enum): class ThreadSafeDict(Generic[KeyType, ValueType]): """A thread-safe dict.""" - def __init__(self, *args, **kwargs) -> None: + def __init__(self, *args: Any, **kwargs: Any) -> None: self._dict: Dict[KeyType, ValueType] = dict(*args, **kwargs) self._lock = threading.Lock() @@ -383,7 +383,7 @@ def _get_service_status( def get_service_status_encoded(service_names: Optional[List[str]]) -> str: - service_statuses = [] + service_statuses: List[Dict[str, str]] = [] if service_names is None: # Get all service names service_names = serve_state.get_glob_service_names(None) @@ -400,7 +400,7 @@ def get_service_status_encoded(service_names: Optional[List[str]]) -> str: def load_service_status(payload: str) -> List[Dict[str, Any]]: service_statuses_encoded = common_utils.decode_payload(payload) - service_statuses = [] + service_statuses: List[Dict[str, Any]] = [] for service_status in service_statuses_encoded: service_statuses.append({ k: pickle.loads(base64.b64decode(v)) @@ -432,7 +432,7 @@ def _terminate_failed_services( A message indicating potential resource leak (if any). If no resource leak is detected, return None. """ - remaining_replica_clusters = [] + remaining_replica_clusters: List[str] = [] # The controller should have already attempted to terminate those # replicas, so we don't need to try again here. for replica_info in serve_state.get_replica_infos(service_name): @@ -459,8 +459,8 @@ def _terminate_failed_services( def terminate_services(service_names: Optional[List[str]], purge: bool) -> str: service_names = serve_state.get_glob_service_names(service_names) - terminated_service_names = [] - messages = [] + terminated_service_names: List[str] = [] + messages: List[str] = [] for service_name in service_names: service_status = _get_service_status(service_name, with_replica_info=False) @@ -506,7 +506,7 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str: f.write(UserSignal.TERMINATE.value) f.flush() terminated_service_names.append(f'{service_name!r}') - if len(terminated_service_names) == 0: + if not terminated_service_names: messages.append('No service to terminate.') else: identity_str = f'Service {terminated_service_names[0]} is' @@ -784,9 +784,9 @@ def get_endpoint(service_record: Dict[str, Any]) -> str: # Don't use backend_utils.is_controller_accessible since it is too slow. 
handle = global_user_state.get_handle_from_cluster_name( SKY_SERVE_CONTROLLER_NAME) - assert isinstance(handle, backends.CloudVmRayResourceHandle) if handle is None: return '-' + assert isinstance(handle, backends.CloudVmRayResourceHandle) load_balancer_port = service_record['load_balancer_port'] if load_balancer_port is None: return '-' @@ -811,10 +811,12 @@ def format_service_table(service_records: List[Dict[str, Any]], 'NAME', 'VERSION', 'UPTIME', 'STATUS', 'REPLICAS', 'ENDPOINT' ] if show_all: - service_columns.extend(['POLICY', 'REQUESTED_RESOURCES']) + service_columns.extend([ + 'AUTOSCALING_POLICY', 'LOAD_BALANCING_POLICY', 'REQUESTED_RESOURCES' + ]) service_table = log_utils.create_table(service_columns) - replica_infos = [] + replica_infos: List[Dict[str, Any]] = [] for record in service_records: for replica in record['replica_info']: replica['service_name'] = record['name'] @@ -832,6 +834,7 @@ def format_service_table(service_records: List[Dict[str, Any]], endpoint = get_endpoint(record) policy = record['policy'] requested_resources_str = record['requested_resources_str'] + load_balancing_policy = record['load_balancing_policy'] service_values = [ service_name, @@ -842,7 +845,8 @@ def format_service_table(service_records: List[Dict[str, Any]], endpoint, ] if show_all: - service_values.extend([policy, requested_resources_str]) + service_values.extend( + [policy, load_balancing_policy, requested_resources_str]) service_table.add_row(service_values) replica_table = _format_replica_table(replica_infos, show_all) @@ -884,7 +888,8 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], region = '-' zone = '-' - replica_handle: 'backends.CloudVmRayResourceHandle' = record['handle'] + replica_handle: Optional['backends.CloudVmRayResourceHandle'] = record[ + 'handle'] if replica_handle is not None: resources_str = resources_utils.get_readable_resources_repr( replica_handle, simplify=not show_all) diff --git a/sky/serve/service.py b/sky/serve/service.py index 0a1c7f34766..dbfc57b22bf 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -150,6 +150,7 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int): controller_job_id=job_id, policy=service_spec.autoscaling_policy_str(), requested_resources_str=backend_utils.get_task_resources_str(task), + load_balancing_policy=service_spec.load_balancing_policy, status=serve_state.ServiceStatus.CONTROLLER_INIT) # Directly throw an error here. See sky/serve/api.py::up # for more details. 
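Tying the serve-side changes above together: the load balancer now calls the policy's pre_execute_hook right before proxying a request and post_execute_hook from the StreamingResponse background task once the upstream response is closed, which is what lets the new least_load policy (the new default) track in-flight requests per replica. Below is a simplified, self-contained sketch of that bookkeeping; the class name and URLs are illustrative, and the real policy in sky/serve/load_balancing_policies.py also receives the fastapi.Request and guards its state with a threading lock.

import collections


class LeastLoadSketch:
    """Illustrative stand-in for the least_load policy's load tracking."""

    def __init__(self) -> None:
        self.ready_replicas = []
        self.load_map = collections.defaultdict(int)

    def set_ready_replicas(self, ready_replicas) -> None:
        self.ready_replicas = ready_replicas

    def select_replica(self):
        # Pick the replica with the fewest requests currently in flight.
        if not self.ready_replicas:
            return None
        return min(self.ready_replicas, key=lambda r: self.load_map[r])

    def pre_execute_hook(self, replica_url: str) -> None:
        self.load_map[replica_url] += 1  # request dispatched to the replica

    def post_execute_hook(self, replica_url: str) -> None:
        self.load_map[replica_url] -= 1  # response fully streamed back


policy = LeastLoadSketch()
policy.set_ready_replicas(['http://replica-1:8080', 'http://replica-2:8080'])
first = policy.select_replica()
policy.pre_execute_hook(first)   # replica-1 now has one in-flight request
print(policy.select_replica())   # -> http://replica-2:8080 (least loaded)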
diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index 000eed139f1..41de54cf806 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -2,12 +2,13 @@ import json import os import textwrap -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional import yaml from sky import serve from sky.serve import constants +from sky.serve import load_balancing_policies as lb_policies from sky.utils import common_utils from sky.utils import schemas from sky.utils import ux_utils @@ -185,9 +186,12 @@ def from_yaml(yaml_path: str) -> 'SkyServiceSpec': return SkyServiceSpec.from_yaml_config(config['service']) def to_yaml_config(self) -> Dict[str, Any]: - config = dict() + config: Dict[str, Any] = {} - def add_if_not_none(section, key, value, no_empty: bool = False): + def add_if_not_none(section: str, + key: Optional[str], + value: Any, + no_empty: bool = False): if no_empty and not value: return if value is not None: @@ -230,8 +234,8 @@ def probe_str(self): ' with custom headers') return f'{method}{headers}' - def spot_policy_str(self): - policy_strs = [] + def spot_policy_str(self) -> str: + policy_strs: List[str] = [] if (self.dynamic_ondemand_fallback is not None and self.dynamic_ondemand_fallback): policy_strs.append('Dynamic on-demand fallback') @@ -327,5 +331,6 @@ def use_ondemand_fallback(self) -> bool: return self._use_ondemand_fallback @property - def load_balancing_policy(self) -> Optional[str]: - return self._load_balancing_policy + def load_balancing_policy(self) -> str: + return lb_policies.LoadBalancingPolicy.make_policy_name( + self._load_balancing_policy) diff --git a/sky/setup_files/setup.py b/sky/setup_files/setup.py index 121f96d8e8b..0770da28c43 100644 --- a/sky/setup_files/setup.py +++ b/sky/setup_files/setup.py @@ -186,6 +186,6 @@ def parse_readme(readme: str) -> str: 'Homepage': 'https://github.com/skypilot-org/skypilot', 'Issues': 'https://github.com/skypilot-org/skypilot/issues', 'Discussion': 'https://github.com/skypilot-org/skypilot/discussions', - 'Documentation': 'https://skypilot.readthedocs.io/en/latest/', + 'Documentation': 'https://docs.skypilot.co/', }, ) diff --git a/sky/sky_logging.py b/sky/sky_logging.py index effeab310d8..8c6ac6911d9 100644 --- a/sky/sky_logging.py +++ b/sky/sky_logging.py @@ -1,12 +1,15 @@ """Logging utilities.""" import builtins import contextlib +from datetime import datetime import logging +import os import sys import threading import colorama +from sky.skylet import constants from sky.utils import env_options from sky.utils import rich_utils @@ -113,7 +116,7 @@ def reload_logger(): _setup_logger() -def init_logger(name: str): +def init_logger(name: str) -> logging.Logger: return logging.getLogger(name) @@ -161,3 +164,16 @@ def is_silent(): # threads. 
_logging_config.is_silent = False return _logging_config.is_silent + + +def get_run_timestamp() -> str: + return 'sky-' + datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f') + + +def generate_tmp_logging_file_path(file_name: str) -> str: + """Generate an absolute path of a tmp file for logging.""" + run_timestamp = get_run_timestamp() + log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp) + log_path = os.path.expanduser(os.path.join(log_dir, file_name)) + + return log_path diff --git a/sky/skylet/job_lib.py b/sky/skylet/job_lib.py index dfd8332b019..65311688fb4 100644 --- a/sky/skylet/job_lib.py +++ b/sky/skylet/job_lib.py @@ -586,7 +586,7 @@ def update_job_status(job_ids: List[int], This function should only be run on the remote instance with ray>=2.4.0. """ echo = logger.info if not silent else logger.debug - if len(job_ids) == 0: + if not job_ids: return [] statuses = [] diff --git a/sky/skylet/providers/ibm/node_provider.py b/sky/skylet/providers/ibm/node_provider.py index 5e2a2d64493..44622369e92 100644 --- a/sky/skylet/providers/ibm/node_provider.py +++ b/sky/skylet/providers/ibm/node_provider.py @@ -377,7 +377,7 @@ def non_terminated_nodes(self, tag_filters) -> List[str]: node["id"], nic_id ).get_result() floating_ips = res["floating_ips"] - if len(floating_ips) == 0: + if not floating_ips: # not adding a node that's yet/failed to # to get a floating ip provisioned continue @@ -485,7 +485,7 @@ def _get_instance_data(self, name): """Returns instance (node) information matching the specified name""" instances_data = self.ibm_vpc_client.list_instances(name=name).get_result() - if len(instances_data["instances"]) > 0: + if instances_data["instances"]: return instances_data["instances"][0] return None diff --git a/sky/skylet/providers/scp/config.py b/sky/skylet/providers/scp/config.py index c20b1837f26..d19744e7322 100644 --- a/sky/skylet/providers/scp/config.py +++ b/sky/skylet/providers/scp/config.py @@ -107,7 +107,7 @@ def get_vcp_subnets(self): for item in subnet_contents if item['subnetState'] == 'ACTIVE' and item["vpcId"] == vpc ] - if len(subnet_list) > 0: + if subnet_list: vpc_subnets[vpc] = subnet_list return vpc_subnets diff --git a/sky/skylet/providers/scp/node_provider.py b/sky/skylet/providers/scp/node_provider.py index 004eaac3830..f99b477ab06 100644 --- a/sky/skylet/providers/scp/node_provider.py +++ b/sky/skylet/providers/scp/node_provider.py @@ -259,7 +259,7 @@ def _config_security_group(self, zone_id, vpc, cluster_name): for sg in sg_contents if sg["securityGroupId"] == sg_id ] - if len(sg) != 0 and sg[0] == "ACTIVE": + if sg and sg[0] == "ACTIVE": break time.sleep(5) @@ -282,16 +282,16 @@ def _del_security_group(self, sg_id): for sg in sg_contents if sg["securityGroupId"] == sg_id ] - if len(sg) == 0: + if not sg: break def _refresh_security_group(self, vms): - if len(vms) > 0: + if vms: return # remove security group if vm does not exist keys = self.metadata.keys() security_group_id = self.metadata[ - keys[0]]['creation']['securityGroupId'] if len(keys) > 0 else None + keys[0]]['creation']['securityGroupId'] if keys else None if security_group_id: try: self._del_security_group(security_group_id) @@ -308,7 +308,7 @@ def _del_vm(self, vm_id): for vm in vm_contents if vm["virtualServerId"] == vm_id ] - if len(vms) == 0: + if not vms: break def _del_firwall_rules(self, firewall_id, rule_ids): @@ -391,7 +391,7 @@ def _create_instance_sequence(self, vpc, instance_config): return None, None, None, None def _undo_funcs(self, undo_func_list): - while 
len(undo_func_list) > 0: + while undo_func_list: func = undo_func_list.pop() func() @@ -468,7 +468,7 @@ def create_node(self, node_config: Dict[str, Any], tags: Dict[str, str], zone_config = ZoneConfig(self.scp_client, node_config) vpc_subnets = zone_config.get_vcp_subnets() - if (len(vpc_subnets) == 0): + if not vpc_subnets: raise SCPError("This region/zone does not have available VPCs.") instance_config = zone_config.bootstrap_instance_config(node_config) diff --git a/sky/skypilot_config.py b/sky/skypilot_config.py index aae62afc616..e973754f4c9 100644 --- a/sky/skypilot_config.py +++ b/sky/skypilot_config.py @@ -238,7 +238,7 @@ def _try_load_config() -> None: _dict, schemas.get_config_schema(), f'Invalid config YAML ({config_path}). See: ' - 'https://skypilot.readthedocs.io/en/latest/reference/config.html. ' # pylint: disable=line-too-long + 'https://docs.skypilot.co/en/latest/reference/config.html. ' # pylint: disable=line-too-long 'Error: ', skip_none=False) diff --git a/sky/task.py b/sky/task.py index 4cc8b43db0b..68c50e05545 100644 --- a/sky/task.py +++ b/sky/task.py @@ -956,7 +956,7 @@ def sync_storage_mounts(self) -> None: }``. """ for storage in self.storage_mounts.values(): - if len(storage.stores) == 0: + if not storage.stores: store_type, store_region = self._get_preferred_store() self.storage_plans[storage] = store_type storage.add_store(store_type, store_region) diff --git a/sky/utils/accelerator_registry.py b/sky/utils/accelerator_registry.py index 78a708efb91..6f4a80cb886 100644 --- a/sky/utils/accelerator_registry.py +++ b/sky/utils/accelerator_registry.py @@ -106,7 +106,7 @@ def canonicalize_accelerator_name(accelerator: str, return names[0] # Do not print an error message here. Optimizer will handle it. - if len(names) == 0: + if not names: return accelerator # Currently unreachable. diff --git a/sky/utils/common_utils.py b/sky/utils/common_utils.py index 3fcdd24e505..ee8f5cf7bec 100644 --- a/sky/utils/common_utils.py +++ b/sky/utils/common_utils.py @@ -633,7 +633,7 @@ def get_cleaned_username(username: str = '') -> str: return username -def fill_template(template_name: str, variables: Dict, +def fill_template(template_name: str, variables: Dict[str, Any], output_path: str) -> None: """Create a file from a Jinja template and return the filename.""" assert template_name.endswith('.j2'), template_name diff --git a/sky/utils/dag_utils.py b/sky/utils/dag_utils.py index 3229f86abf9..d0eb03d46ea 100644 --- a/sky/utils/dag_utils.py +++ b/sky/utils/dag_utils.py @@ -89,7 +89,7 @@ def load_chain_dag_from_yaml( elif len(configs) == 1: dag_name = configs[0].get('name') - if len(configs) == 0: + if not configs: # YAML has only `name: xxx`. Still instantiate a task. configs = [{'name': dag_name}] diff --git a/sky/utils/kubernetes/deploy_remote_cluster.sh b/sky/utils/kubernetes/deploy_remote_cluster.sh index 94736474289..8d7ba3e5729 100755 --- a/sky/utils/kubernetes/deploy_remote_cluster.sh +++ b/sky/utils/kubernetes/deploy_remote_cluster.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Refer to https://skypilot.readthedocs.io/en/latest/reservations/existing-machines.html for details on how to use this script. +# Refer to https://docs.skypilot.co/en/latest/reservations/existing-machines.html for details on how to use this script. 
set -e # Colors for nicer UX diff --git a/sky/utils/kubernetes/gpu_labeler.py b/sky/utils/kubernetes/gpu_labeler.py index 14fbbdedca5..9f5a11cea42 100644 --- a/sky/utils/kubernetes/gpu_labeler.py +++ b/sky/utils/kubernetes/gpu_labeler.py @@ -101,7 +101,7 @@ def label(): # Get the list of nodes with GPUs gpu_nodes = [] for node in nodes: - if kubernetes_utils.GPU_RESOURCE_KEY in node.status.capacity: + if kubernetes_utils.get_gpu_resource_key() in node.status.capacity: gpu_nodes.append(node) print(f'Found {len(gpu_nodes)} GPU nodes in the cluster') @@ -115,7 +115,7 @@ def label(): print('Continuing without using nvidia RuntimeClass. ' 'This may fail on K3s clusters. ' 'For more details, refer to K3s deployment notes at: ' - 'https://skypilot.readthedocs.io/en/latest/reference/kubernetes/kubernetes-setup.html') # pylint: disable=line-too-long + 'https://docs.skypilot.co/en/latest/reference/kubernetes/kubernetes-setup.html') # pylint: disable=line-too-long nvidia_exists = False if nvidia_exists: @@ -139,10 +139,10 @@ def label(): # Create the job for this node` batch_v1.create_namespaced_job(namespace, job_manifest) print(f'Created GPU labeler job for node {node_name}') - if len(gpu_nodes) == 0: + if not gpu_nodes: print('No GPU nodes found in the cluster. If you have GPU nodes, ' 'please ensure that they have the label ' - f'`{kubernetes_utils.GPU_RESOURCE_KEY}: `') + f'`{kubernetes_utils.get_gpu_resource_key()}: `') else: print('GPU labeling started - this may take 10 min or more to complete.' '\nTo check the status of GPU labeling jobs, run ' diff --git a/sky/utils/kubernetes/ssh_jump_lifecycle_manager.py b/sky/utils/kubernetes/ssh_jump_lifecycle_manager.py index 380c82f8c88..a764fb6e5e4 100644 --- a/sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +++ b/sky/utils/kubernetes/ssh_jump_lifecycle_manager.py @@ -126,7 +126,7 @@ def manage_lifecycle(): f'error: {e}\n') raise - if len(ret.items) == 0: + if not ret.items: sys.stdout.write( f'[Lifecycle] Did not find pods with label ' f'"{label_selector}" in namespace {current_namespace}\n') diff --git a/tests/skyserve/load_balancer/service.yaml b/tests/skyserve/load_balancer/service.yaml index 742b8efd2f4..232136d4a61 100644 --- a/tests/skyserve/load_balancer/service.yaml +++ b/tests/skyserve/load_balancer/service.yaml @@ -5,6 +5,7 @@ service: initial_delay_seconds: 180 replica_policy: min_replicas: 3 + load_balancing_policy: round_robin resources: ports: 8080 diff --git a/tests/skyserve/update/new.yaml b/tests/skyserve/update/new.yaml index 2c9cebd0cb5..5e5d853e09d 100644 --- a/tests/skyserve/update/new.yaml +++ b/tests/skyserve/update/new.yaml @@ -3,6 +3,7 @@ service: path: /health initial_delay_seconds: 100 replicas: 2 + load_balancing_policy: round_robin resources: ports: 8081 diff --git a/tests/skyserve/update/old.yaml b/tests/skyserve/update/old.yaml index 4b99cb92e8c..4cb19b8327b 100644 --- a/tests/skyserve/update/old.yaml +++ b/tests/skyserve/update/old.yaml @@ -3,6 +3,7 @@ service: path: /health initial_delay_seconds: 20 replicas: 2 + load_balancing_policy: round_robin resources: ports: 8080 diff --git a/tests/smoke_tests/test_cluster_job.py b/tests/smoke_tests/test_cluster_job.py index 0255884ae30..18b82c649e7 100644 --- a/tests/smoke_tests/test_cluster_job.py +++ b/tests/smoke_tests/test_cluster_job.py @@ -1299,6 +1299,34 @@ def test_use_spot(generic_cloud: str): smoke_tests_utils.run_one_test(test) +@pytest.mark.azure +def test_azure_spot_instance_verification(): + """Test Azure spot instance provisioning with explicit verification. 
+ This test verifies that when --use-spot is specified for Azure: + 1. The cluster launches successfully + 2. The instances are actually provisioned as spot instances + """ + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( + 'azure-spot-verification', + [ + f'sky launch -c {name} --cloud azure tests/test_yamls/minimal.yaml --use-spot -y', + f'sky logs {name} 1 --status', f'TARGET_VM_NAME="{name}"; ' + 'VM_INFO=$(az vm list --query "[?contains(name, \'$TARGET_VM_NAME\')].{Name:name, ResourceGroup:resourceGroup}" -o tsv); ' + '[[ -z "$VM_INFO" ]] && exit 1; ' + 'FULL_VM_NAME=$(echo "$VM_INFO" | awk \'{print $1}\'); ' + 'RESOURCE_GROUP=$(echo "$VM_INFO" | awk \'{print $2}\'); ' + 'VM_DETAILS=$(az vm list --resource-group "$RESOURCE_GROUP" ' + '--query "[?name==\'$FULL_VM_NAME\'].{Name:name, Location:location, Priority:priority}" -o table); ' + '[[ -z "$VM_DETAILS" ]] && exit 1; ' + 'echo "VM Details:"; echo "$VM_DETAILS"; ' + 'echo "$VM_DETAILS" | grep -qw "Spot" && exit 0 || exit 1' + ], + f'sky down -y {name}', + ) + smoke_tests_utils.run_one_test(test) + + @pytest.mark.gcp def test_stop_gcp_spot(): """Test GCP spot can be stopped, autostopped, restarted.""" diff --git a/tests/smoke_tests/test_pre_merge.py b/tests/smoke_tests/test_quick_tests_core.py similarity index 63% rename from tests/smoke_tests/test_pre_merge.py rename to tests/smoke_tests/test_quick_tests_core.py index 4890ac15ce4..48df4ef9a2b 100644 --- a/tests/smoke_tests/test_pre_merge.py +++ b/tests/smoke_tests/test_quick_tests_core.py @@ -1,23 +1,27 @@ # Smoke tests for SkyPilot required before merging +# If the change includes an interface modification or touches the core API, +# the reviewer could decide it’s necessary to trigger a pre-merge test and +# leave a comment /quicktest-core will then trigger this test. +# # Default options are set in pyproject.toml # Example usage: # Run all tests except for AWS and Lambda Cloud -# > pytest tests/smoke_tests/test_pre_merge.py +# > pytest tests/smoke_tests/test_quick_tests_core.py # # Terminate failed clusters after test finishes -# > pytest tests/smoke_tests/test_pre_merge.py --terminate-on-failure +# > pytest tests/smoke_tests/test_quick_tests_core.py --terminate-on-failure # # Re-run last failed tests # > pytest --lf # # Run one of the smoke tests -# > pytest tests/smoke_tests/test_pre_merge.py::test_yaml_launch_and_mount +# > pytest tests/smoke_tests/test_quick_tests_core.py::test_yaml_launch_and_mount # # Only run test for AWS + generic tests -# > pytest tests/smoke_tests/test_pre_merge.py --aws +# > pytest tests/smoke_tests/test_quick_tests_core.py --aws # # Change cloud for generic tests to aws -# > pytest tests/smoke_tests/test_pre_merge.py --generic-cloud aws +# > pytest tests/smoke_tests/test_quick_tests_core.py --generic-cloud aws from smoke_tests import smoke_tests_utils @@ -29,7 +33,7 @@ def test_yaml_launch_and_mount(generic_cloud: str): test = smoke_tests_utils.Test( 'test_yaml_launch_and_mount', [ - f'sky launch -y -c {name} tests/test_yamls/minimal_test_pre_merge.yaml', + f'sky launch -y -c {name} tests/test_yamls/minimal_test_quick_tests_core.yaml', smoke_tests_utils. 
get_cmd_wait_until_job_status_contains_matching_job_id( cluster_name=name, diff --git a/tests/test_yaml_parser.py b/tests/test_yaml_parser.py index 7d304b60633..a9fad1b4b83 100644 --- a/tests/test_yaml_parser.py +++ b/tests/test_yaml_parser.py @@ -96,8 +96,8 @@ def test_empty_fields_storage(tmp_path): storage = task.storage_mounts['/mystorage'] assert storage.name == 'sky-dataset' assert storage.source is None - assert len(storage.stores) == 0 - assert storage.persistent is True + assert not storage.stores + assert storage.persistent def test_invalid_fields_storage(tmp_path): diff --git a/tests/test_yamls/minimal_test_pre_merge.yaml b/tests/test_yamls/minimal_test_quick_tests_core.yaml similarity index 62% rename from tests/test_yamls/minimal_test_pre_merge.yaml rename to tests/test_yamls/minimal_test_quick_tests_core.yaml index 583575bee5c..15857e972dd 100644 --- a/tests/test_yamls/minimal_test_pre_merge.yaml +++ b/tests/test_yamls/minimal_test_quick_tests_core.yaml @@ -10,4 +10,4 @@ workdir: . num_nodes: 1 run: | - ls -l ~/aws/tests/test_yamls/minimal_test_pre_merge.yaml + ls -l ~/aws/tests/test_yamls/minimal_test_quick_tests_core.yaml diff --git a/tests/unit_tests/sky/adaptors/test_oci.py b/tests/unit_tests/sky/adaptors/test_oci.py new file mode 100644 index 00000000000..59c2b1f99b7 --- /dev/null +++ b/tests/unit_tests/sky/adaptors/test_oci.py @@ -0,0 +1,65 @@ +"""Tests for OCI adaptor.""" +import logging + +import pytest + +from sky import check as sky_check +from sky.adaptors import oci +from sky.utils import log_utils + + +def test_oci_circuit_breaker_logging(): + """Test that OCI circuit breaker logging is properly configured.""" + # Get the circuit breaker logger + logger = logging.getLogger('oci.circuit_breaker') + + # Create a handler that captures log records + log_records = [] + test_handler = logging.Handler() + test_handler.emit = lambda record: log_records.append(record) + logger.addHandler(test_handler) + + # Create a null handler to suppress logs during import + null_handler = logging.NullHandler() + logger.addHandler(null_handler) + + try: + # Verify logger starts at WARNING level (set by adaptor initialization) + initial_level = logger.getEffectiveLevel() + print( + f'Initial logger level: {initial_level} (WARNING={logging.WARNING})' + ) + assert initial_level == logging.WARNING, ( + 'OCI circuit breaker logger should be set to WARNING before initialization' + ) + + # Force OCI module import through LazyImport by accessing a module attribute + print('Attempting to import OCI module...') + try: + # This will trigger LazyImport's load_module for the actual OCI module + _ = oci.oci.config.DEFAULT_LOCATION + except (ImportError, AttributeError) as e: + # Expected when OCI SDK is not installed + print(f'Import/Attribute error as expected: {e}') + pass + + # Verify logger level after import attempt + after_level = logger.getEffectiveLevel() + print( + f'Logger level after import: {after_level} (WARNING={logging.WARNING})' + ) + assert after_level == logging.WARNING, ( + 'OCI circuit breaker logger should remain at WARNING after initialization' + ) + + # Verify no circuit breaker logs were emitted + circuit_breaker_logs = [ + record for record in log_records + if 'Circuit breaker' in record.getMessage() + ] + assert not circuit_breaker_logs, ( + 'No circuit breaker logs should be emitted during initialization') + finally: + # Clean up the handlers + logger.removeHandler(test_handler) + logger.removeHandler(null_handler) diff --git a/tests/unit_tests/test_storage_utils.py 
b/tests/unit_tests/test_storage_utils.py index cd1e436390b..6edb5abf2f5 100644 --- a/tests/unit_tests/test_storage_utils.py +++ b/tests/unit_tests/test_storage_utils.py @@ -7,7 +7,7 @@ def test_get_excluded_files_from_skyignore_no_file(): excluded_files = storage_utils.get_excluded_files_from_skyignore('.') - assert len(excluded_files) == 0 + assert not excluded_files def test_get_excluded_files_from_skyignore():