separate auxiliary functions
luiztauffer committed Sep 21, 2023
1 parent ffe609e commit 0ad021a
Showing 10 changed files with 173 additions and 194 deletions.
containers/Dockerfile.combined (6 changes: 3 additions & 3 deletions)
@@ -37,13 +37,13 @@ COPY requirements.txt .
 RUN pip install -r requirements.txt
 
 WORKDIR /app
-COPY run_script.py .
+COPY main.py .
+COPY utils.py .
 COPY light_server.py .
 RUN mkdir /data
 RUN mkdir /logs
 
 # Get Python stdout logs
 ENV PYTHONUNBUFFERED=1
 
-CMD ["python", "light_server.py"]
-# CMD ["uvicorn", "light_server:app", "--host", "0.0.0.0", "--port", "5000", "--reload"]
+CMD ["python", "light_server.py"]
containers/Dockerfile_ks2_5 → containers/Dockerfile.ks2_5 (19 changes: 9 additions & 10 deletions)
@@ -1,5 +1,5 @@
 # Spike sorters image
-FROM spikeinterface/kilosort2_5-compiled-base as ks25base
+FROM spikeinterface/kilosort2_5-compiled-base:0.2.0 as ks25base
 
 # NVIDIA-ready Image
 FROM nvidia/cuda:11.6.2-base-ubuntu20.04
@@ -12,9 +12,8 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-ENV MINICONDA_VERSION 4.8.2
 ENV CONDA_DIR /home/miniconda3
-ENV LATEST_CONDA_SCRIPT "Miniconda3-py38_$MINICONDA_VERSION-Linux-x86_64.sh"
+ENV LATEST_CONDA_SCRIPT "Miniconda3-py39_23.5.2-0-Linux-x86_64.sh"
 
 RUN wget --quiet https://repo.anaconda.com/miniconda/$LATEST_CONDA_SCRIPT -O ~/miniconda.sh && \
     bash ~/miniconda.sh -b -p $CONDA_DIR && \
@@ -29,20 +28,20 @@ ENV PATH="/opt/matlabruntime:${PATH}"
 COPY --from=ks25base /usr/lib/x86_64-linux-gnu/libXt.so.6 /usr/lib/x86_64-linux-gnu/libXt.so.6
 COPY --from=ks25base /usr/lib/x86_64-linux-gnu/libSM.so.6 /usr/lib/x86_64-linux-gnu/libSM.so.6
 COPY --from=ks25base /usr/lib/x86_64-linux-gnu/libICE.so.6 /usr/lib/x86_64-linux-gnu/libICE.so.6
-ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/matlabruntime/v911/runtime/glnxa64:/opt/matlabruntime/v911/bin/glnxa64:/opt/matlabruntime/v911/sys/os/glnxa64:/opt/matlabruntime/v911/sys/opengl/lib/glnxa64:/opt/matlabruntime/v911/extern/bin/glnxa64
-
-# Get Python stdout logs
-ENV PYTHONUNBUFFERED=1
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/matlabruntime/R2022b/runtime/glnxa64:/opt/matlabruntime/R2022b/bin/glnxa64:/opt/matlabruntime/R2022b/sys/os/glnxa64:/opt/matlabruntime/R2022b/sys/opengl/lib/glnxa64:/opt/matlabruntime/R2022b/extern/bin/glnxa64
 
 # Copy requirements and script
 COPY requirements.txt .
 RUN pip install -r requirements.txt
 
 WORKDIR /app
-COPY run_script.py .
+COPY main.py .
+COPY utils.py .
 COPY light_server.py .
 RUN mkdir /data
 RUN mkdir /logs
 
-CMD ["python", "light_server.py"]
-# ENTRYPOINT ["python", "run_script.py"]
+# Get Python stdout logs
+ENV PYTHONUNBUFFERED=1
+
+CMD ["python", "light_server.py"]
containers/Dockerfile_ks3 → containers/Dockerfile.ks3 (19 changes: 9 additions & 10 deletions)
@@ -1,5 +1,5 @@
 # Spike sorter image
-FROM spikeinterface/kilosort3-compiled-base as ksbase
+FROM spikeinterface/kilosort3-compiled-base:0.2.0 as ksbase
 
 # NVIDIA-ready Image
 FROM nvidia/cuda:11.6.2-base-ubuntu20.04
@@ -12,9 +12,8 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-ENV MINICONDA_VERSION 4.8.2
 ENV CONDA_DIR /home/miniconda3
-ENV LATEST_CONDA_SCRIPT "Miniconda3-py38_$MINICONDA_VERSION-Linux-x86_64.sh"
+ENV LATEST_CONDA_SCRIPT "Miniconda3-py39_23.5.2-0-Linux-x86_64.sh"
 
 RUN wget --quiet https://repo.anaconda.com/miniconda/$LATEST_CONDA_SCRIPT -O ~/miniconda.sh && \
     bash ~/miniconda.sh -b -p $CONDA_DIR && \
@@ -29,20 +28,20 @@ ENV PATH="/opt/matlabruntime:${PATH}"
 COPY --from=ksbase /usr/lib/x86_64-linux-gnu/libXt.so.6 /usr/lib/x86_64-linux-gnu/libXt.so.6
 COPY --from=ksbase /usr/lib/x86_64-linux-gnu/libSM.so.6 /usr/lib/x86_64-linux-gnu/libSM.so.6
 COPY --from=ksbase /usr/lib/x86_64-linux-gnu/libICE.so.6 /usr/lib/x86_64-linux-gnu/libICE.so.6
-ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/matlabruntime/v911/runtime/glnxa64:/opt/matlabruntime/v911/bin/glnxa64:/opt/matlabruntime/v911/sys/os/glnxa64:/opt/matlabruntime/v911/sys/opengl/lib/glnxa64:/opt/matlabruntime/v911/extern/bin/glnxa64
-
-# Get Python stdout logs
-ENV PYTHONUNBUFFERED=1
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/matlabruntime/R2022b/runtime/glnxa64:/opt/matlabruntime/R2022b/bin/glnxa64:/opt/matlabruntime/R2022b/sys/os/glnxa64:/opt/matlabruntime/R2022b/sys/opengl/lib/glnxa64:/opt/matlabruntime/R2022b/extern/bin/glnxa64
 
 # Copy requirements and script
 COPY requirements.txt .
 RUN pip install -r requirements.txt
 
 WORKDIR /app
-COPY run_script.py .
+COPY main.py .
+COPY utils.py .
 COPY light_server.py .
 RUN mkdir /data
 RUN mkdir /logs
 
-CMD ["python", "light_server.py"]
-# ENTRYPOINT ["python", "run_script.py"]
+# Get Python stdout logs
+ENV PYTHONUNBUFFERED=1
+
+CMD ["python", "light_server.py"]
containers/Dockerfile_simple (15 changes: 0 additions & 15 deletions)

This file was deleted.

containers/README.md (2 changes: 1 addition & 1 deletion)
@@ -94,7 +94,7 @@ If having difficulties pushing the image to ECR:
 4. Create a Job Definition (EC2)
    - Choose suitable Execution Timeout, Job Attempts and Retry Strategies
    - Select the base image
-   - Command = `python run_script.py`
+   - Command = `python main.py`
    - For the Execution role and Job role configuration, choose the `BatchJobsAccessRole`
    - Configure the resource requirements. Remember to choose a value for Memory slightly smaller than the value for the machines you're hoping to use, otherwise ECS might not find suitable instances.
    - Add any fixed ENV variables that should be used by any Jobs using this definition
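The Job Definition described in the README steps above can also be registered programmatically. A sketch with boto3 follows; the definition name, image URI, role ARNs, and resource values are placeholders, not values taken from this repo:

import boto3

batch = boto3.client("batch")
batch.register_job_definition(
    jobDefinitionName="spike-sorting",          # placeholder name
    type="container",
    timeout={"attemptDurationSeconds": 7200},   # "suitable Execution Timeout"
    retryStrategy={"attempts": 1},              # "Job Attempts and Retry Strategies"
    containerProperties={
        "image": "<account>.dkr.ecr.<region>.amazonaws.com/<repo>:<tag>",
        "command": ["python", "main.py"],       # matches the change above
        "jobRoleArn": "arn:aws:iam::<account>:role/BatchJobsAccessRole",
        "executionRoleArn": "arn:aws:iam::<account>:role/BatchJobsAccessRole",
        # Keep MEMORY slightly below the instance's physical RAM, as the README advises
        "resourceRequirements": [
            {"type": "VCPU", "value": "16"},
            {"type": "MEMORY", "value": "61000"},
        ],
        "environment": [{"name": "PYTHONUNBUFFERED", "value": "1"}],
    },
)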
File renamed without changes.
containers/light_server.py (2 changes: 1 addition & 1 deletion)
@@ -3,7 +3,7 @@
 import logging
 import functools
 
-from run_script import main
+from main import main
 
 
 app = Flask(__name__)
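Only the changed import is shown above. For orientation, here is a hypothetical reconstruction of how light_server.py likely wraps main() in an HTTP endpoint; the /run route, payload handling, and response body are assumptions, since only the import, app = Flask(__name__), and port 5000 (from the commented uvicorn CMD) appear in the diffs:

from flask import Flask, request, jsonify

from main import main

app = Flask(__name__)


@app.route("/run", methods=["POST"])
def run():
    # Assumption: main() takes its run parameters as keyword arguments
    kwargs = request.get_json(force=True) or {}
    main(**kwargs)
    return jsonify({"message": "sorting run finished"})


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)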
containers/run_script.py → containers/main.py (174 changes: 20 additions & 154 deletions)
@@ -1,11 +1,6 @@
 import boto3
-import botocore
 import os
 import ast
-import shutil
-import requests
-import logging
-import sys
 import subprocess
 from warnings import filterwarnings
 from datetime import datetime
@@ -21,147 +16,13 @@
 from dandi.upload import upload
 from dandi.download import download
 
-
-# TODO - complete with more data types
-DATA_TYPE_TO_READER = {
-    "spikeglx": se.read_spikeglx,
-    "nwb": se.read_nwb_recording,
-}
-
-# # TODO - create data models for inputs of each data type reader
-# DATA_TYPE_READER_DATA_MODELS = {
-#     "spikeglx": ,
-#     "nwb": ,
-# }
-
-# # TODO - complete with more sorters
-# SORTER_DATA_MODELS = {
-#     "kilosort3": ,
-#     "kilosort2_5":,
-# }
-
-
-class Tee(object):
-    def __init__(self, *files):
-        self.files = files
-    def write(self, obj):
-        for f in self.files:
-            f.write(obj)
-            f.flush()
-    def flush(self):
-        for f in self.files:
-            f.flush()
-
-
-def make_logger(run_identifier: str, log_to_file: bool):
-    logging.basicConfig()
-    logger = logging.getLogger("sorting_worker")
-    logger.handlers.clear()
-    logger.setLevel(logging.DEBUG)
-    log_formatter = logging.Formatter(
-        fmt="%(asctime)s %(levelname)s %(name)s -- %(message)s",
-        datefmt="%Y-%m-%d %H:%M:%S",
-    )
-
-    if log_to_file:
-        # Add a logging handler for the log file
-        fileHandler = logging.FileHandler(
-            filename=f"/logs/sorting_worker_{run_identifier}.log",
-            mode="a",
-        )
-        fileHandler.setFormatter(log_formatter)
-        fileHandler.setLevel(level=logging.DEBUG)
-        logger.addHandler(fileHandler)
-
-        # Add a logging handler for stdout
-        stdoutHandler = logging.StreamHandler(sys.stdout)
-        stdoutHandler.setLevel(logging.DEBUG)
-        stdoutHandler.setFormatter(log_formatter)
-        logger.addHandler(stdoutHandler)
-
-        # Redirect stdout to a file-like object that writes to both stdout and the log file
-        stdout_log_file = open(f"/logs/sorting_worker_{run_identifier}.log", "a")
-        sys.stdout = Tee(sys.stdout, stdout_log_file)
-    else:
-        # Handler to print to console as well
-        handler = logging.StreamHandler(sys.stdout)
-        handler.setLevel(logging.DEBUG)
-        handler.setFormatter(log_formatter)
-        logger.addHandler(handler)
-    return logger
-
-
-def download_file_from_url(url):
-    # ref: https://stackoverflow.com/a/39217788/11483674
-    local_filename = "/data/filename.nwb"
-    with requests.get(url, stream=True) as r:
-        with open(local_filename, 'wb') as f:
-            shutil.copyfileobj(r.raw, f)
-
-
-def download_file_from_s3(
-    client: botocore.client.BaseClient,
-    bucket_name: str,
-    file_path: str
-):
-    file_name = file_path.split("/")[-1]
-    client.download_file(
-        Bucket=bucket_name,
-        Key=file_path,
-        Filename=f"/data/{file_name}"
-    )
-    return file_name
-
-
-def download_all_files_from_bucket_folder(
-    client: botocore.client.BaseClient,
-    bucket_name: str,
-    bucket_folder: str
-):
-    # List files in folder, download all files with content
-    res = client.list_objects_v2(Bucket=bucket_name, Prefix=bucket_folder)
-    for f in res["Contents"]:
-        if f["Size"] > 0:
-            file_name = f["Key"].split("/")[-1]
-            client.download_file(
-                Bucket=bucket_name,
-                Key=f["Key"],
-                Filename=f"/data/{file_name}"
-            )
-
-
-def upload_file_to_bucket(
-    logger: logging.Logger,
-    client: botocore.client.BaseClient,
-    bucket_name: str,
-    bucket_folder: str,
-    local_file_path: str
-):
-    # Upload file to S3
-    logger.info(f"Uploading {local_file_path}...")
-    client.upload_file(
-        Filename=local_file_path,
-        Bucket=bucket_name,
-        Key=f"{bucket_folder}/{local_file_path}",
-    )
-
-
-def upload_all_files_to_bucket_folder(
-    logger: logging.Logger,
-    client: botocore.client.BaseClient,
-    bucket_name: str,
-    bucket_folder: str,
-    local_folder: str
-):
-    # List files from results, upload them to S3
-    files_list = [f for f in Path(local_folder).rglob("*") if f.is_file()]
-    for f in files_list:
-        logger.info(f"Uploading {str(f)}...")
-        client.upload_file(
-            Filename=str(f),
-            Bucket=bucket_name,
-            Key=f"{bucket_folder}{str(f)}",
-        )
+from utils import (
+    make_logger,
+    download_file_from_s3,
+    upload_file_to_bucket,
+    upload_all_files_to_bucket_folder,
+    download_file_from_url,
+)
 
 
 def main(
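The helpers deleted above are exactly what the new import block pulls back in, so the new containers/utils.py presumably receives them unchanged. A condensed skeleton, inferred from the deleted code, with bodies elided; note that download_all_files_from_bucket_folder is deleted here but not re-imported by main.py:

# Inferred skeleton of containers/utils.py (the file itself is not shown in this diff)
import sys
import shutil
import logging
import requests
import botocore
from pathlib import Path


class Tee(object):
    """Mirrors writes across streams; used to tee stdout into the log file."""
    def __init__(self, *files):
        self.files = files

    def write(self, obj):
        for f in self.files:
            f.write(obj)
            f.flush()

    def flush(self):
        for f in self.files:
            f.flush()


def make_logger(run_identifier: str, log_to_file: bool):
    """Builds the 'sorting_worker' logger; body as deleted above."""


def download_file_from_url(url):
    """Streams a URL into /data/filename.nwb; body as deleted above."""


def download_file_from_s3(client: botocore.client.BaseClient, bucket_name: str, file_path: str):
    """Fetches one S3 object into /data/; body as deleted above."""


def download_all_files_from_bucket_folder(client: botocore.client.BaseClient, bucket_name: str, bucket_folder: str):
    """Downloads every non-empty object under a prefix; body as deleted above."""


def upload_file_to_bucket(logger: logging.Logger, client: botocore.client.BaseClient, bucket_name: str, bucket_folder: str, local_file_path: str):
    """Uploads a single local file to S3; body as deleted above."""


def upload_all_files_to_bucket_folder(logger: logging.Logger, client: botocore.client.BaseClient, bucket_name: str, bucket_folder: str, local_folder: str):
    """Recursively uploads a results folder to S3; body as deleted above."""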
@@ -210,8 +71,9 @@ def main(
     - AWS_ACCESS_KEY_ID
     - AWS_SECRET_ACCESS_KEY
-    If saving results to DANDI archive, or reading from embargoed dandisets, the following ENV variable should be present in the running container:
+    If saving results to DANDI archive, or reading from embargoed dandisets, the following ENV variables should be present in the running container:
     - DANDI_API_KEY
+    - DANDI_API_KEY_STAGING
     """
 
     # Order of priority for definition of running arguments:
@@ -507,13 +369,17 @@ def main(
         # Upload results to DANDI
         logger.info(f"Uploading results to DANDI: {output_path}")
         dandi_instance = "dandi-staging" if "staging" in output_path else "dandi"
-        upload(
-            paths=[str(dandiset_local_full_path)],
-            existing="refresh",
-            validation="require",
-            dandi_instance=dandi_instance,
-            sync=True,
-        )
+        if dandi_instance == "dandi-staging":
+            DANDI_API_KEY = os.environ.get("DANDI_API_KEY_STAGING", None)
+            if DANDI_API_KEY is None:
+                raise Exception("DANDI_API_KEY_STAGING not found in ENV variables. Cannot upload results to DANDI staging.")
+        # upload(
+        #     paths=[str(dandiset_local_full_path)],
+        #     existing="refresh",
+        #     validation="require",
+        #     dandi_instance=dandi_instance,
+        #     sync=True,
+        # )
     else:
         # Upload results to local - already done by mounted volume
         pass
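Note that this hunk adds the staging-token check but leaves the upload() call itself commented out, so staging uploads are validated but never executed. A sketch of how the two pieces would plausibly combine once re-enabled, assuming the dandi client reads its token from the DANDI_API_KEY environment variable (the dandi-cli convention); this is not code from the commit:

if dandi_instance == "dandi-staging":
    DANDI_API_KEY = os.environ.get("DANDI_API_KEY_STAGING", None)
    if DANDI_API_KEY is None:
        raise Exception(
            "DANDI_API_KEY_STAGING not found in ENV variables. "
            "Cannot upload results to DANDI staging."
        )
    # Hand the staging token to the dandi client before uploading
    os.environ["DANDI_API_KEY"] = DANDI_API_KEY
upload(
    paths=[str(dandiset_local_full_path)],
    existing="refresh",
    validation="require",
    dandi_instance=dandi_instance,
    sync=True,
)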