Commit

Move code to python

satyaog committed Jul 13, 2023
1 parent 19d1ebd commit 0afd759
Showing 5 changed files with 92 additions and 98 deletions.
91 changes: 44 additions & 47 deletions docs/examples/data/torchvision/README.rst
@@ -69,7 +69,7 @@ repository.
  +# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for
  +# faster training
  +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \
 -+    time -p bash data.sh "/network/datasets/inat" ${_DATA_PREP_WORKERS}
 ++    time -p python3 data.py "/network/datasets/inat" ${_DATA_PREP_WORKERS}
  # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0
@@ -293,67 +293,64 @@ repository.
     main()

-**data.sh**
+**data.py**

-.. code:: bash
+.. code:: python

-   #!/bin/bash
-   set -o errexit
-
-
-   function ln_files {
-      # Clone the dataset structure of `src` to `dest` with symlinks and using
-      # `workers` number of workers (defaults to 4)
-      local src=$1
-      local dest=$2
-      local workers=${3:-4}
-
-      (cd "${src}" && find -L * -type f) | while read f
-      do
-         mkdir --parents "${dest}/$(dirname "$f")"
-         # echo source first so it is matched to the ln's '-T' argument
-         readlink --canonicalize "${src}/$f"
-         # echo output last so ln understands it's the output file
-         echo "${dest}/$f"
-      done | xargs -n2 -P${workers} ln --symbolic --force -T
-   }
-
-   _SRC=$1
-   _WORKERS=$2
-   # Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the
-   # environment variable will only be resolved on the worker node (i.e. not
-   # referencing the $SLURM_TMPDIR of the master node)
-   _DEST=$SLURM_TMPDIR/data
-
-   ln_files "${_SRC}" "${_DEST}" ${_WORKERS}
-
-   # Reorganise the files if needed
-   (
-      cd "${_DEST}"
-      # Torchvision expects these names
-      mv train.tar.gz 2021_train.tgz
-      mv val.tar.gz 2021_valid.tgz
-   )
-
-   # Extract and prepare the data
-   python3 data.py "${_DEST}"
-
-
-**data.py**
-
-.. code:: python
-
-   """Make sure the data is available"""
-   import sys
-   import time
-
-   from torchvision.datasets import INaturalist
-
-
-   start_time = time.time()
-   INaturalist(root=sys.argv[1], version="2021_train", download=True)
-   INaturalist(root=sys.argv[1], version="2021_valid", download=True)
-   seconds_spent = time.time() - start_time
-   print(f"Prepared data in {seconds_spent/60:.2f}m")
+   """Make sure the data is available"""
+   import os
+   import shutil
+   import sys
+   import time
+   from multiprocessing import Pool
+   from pathlib import Path
+
+   from torchvision.datasets import INaturalist
+
+
+   def link_file(src: str, dest: str):
+       # Create the symlink at `dest`, pointing back to the original file `src`
+       Path(dest).symlink_to(src)
+
+
+   def link_files(src: str, dest: str, workers=4):
+       # Clone the directory structure of `src` under `dest` and symlink every
+       # file, using `workers` processes (defaults to 4)
+       src = Path(src)
+       dest = Path(dest)
+       os.makedirs(dest, exist_ok=True)
+       with Pool(processes=workers) as pool:
+           for path, dnames, fnames in os.walk(str(src)):
+               rel_path = Path(path).relative_to(src)
+               fnames = map(lambda _f: rel_path / _f, fnames)
+               dnames = map(lambda _d: rel_path / _d, dnames)
+               for d in dnames:
+                   os.makedirs(str(dest / d), exist_ok=True)
+               pool.starmap(
+                   link_file,
+                   [(src / _f, dest / _f) for _f in fnames]
+               )
+
+
+   if __name__ == "__main__":
+       src = Path(sys.argv[1])
+       workers = int(sys.argv[2])
+       # Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the
+       # environment variable will only be resolved on the worker node (i.e.
+       # not referencing the $SLURM_TMPDIR of the master node)
+       dest = Path(os.environ["SLURM_TMPDIR"]) / "dest"
+
+       start_time = time.time()
+
+       link_files(src, dest, workers)
+
+       # Torchvision expects these names
+       shutil.move(dest / "train.tar.gz", dest / "2021_train.tgz")
+       shutil.move(dest / "val.tar.gz", dest / "2021_valid.tgz")
+
+       INaturalist(root=dest, version="2021_train", download=True)
+       INaturalist(root=dest, version="2021_valid", download=True)
+
+       seconds_spent = time.time() - start_time
+
+       print(f"Prepared data in {seconds_spent/60:.2f}m")

**Running this example**
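A note on the direction of the links created above: `Path.symlink_to()` creates the link at the path it is called on, pointing at its argument, which is why `link_file` receives the original dataset file first and the link location second. A minimal sketch of the semantics (file names are illustrative only):

.. code:: python

   from pathlib import Path

   # symlink_to() creates the path it is called on, pointing at its
   # argument: "alias.txt" becomes a link that resolves to "real.txt".
   Path("real.txt").write_text("data")
   Path("alias.txt").symlink_to("real.txt")
   assert Path("alias.txt").read_text() == "data"
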
6 changes: 0 additions & 6 deletions docs/examples/data/torchvision/_index.rst
@@ -27,12 +27,6 @@ repository.
    :language: diff


-**data.sh**
-
-.. literalinclude:: examples/data/torchvision/data.sh
-   :language: bash
-
-
 **data.py**

 .. literalinclude:: examples/data/torchvision/data.py
52 changes: 47 additions & 5 deletions docs/examples/data/torchvision/data.py
@@ -1,12 +1,54 @@
"""Make sure the data is available"""
import os
import shutil
import sys
import time
from multiprocessing import Pool
from pathlib import Path

from torchvision.datasets import INaturalist


start_time = time.time()
INaturalist(root=sys.argv[1], version="2021_train", download=True)
INaturalist(root=sys.argv[1], version="2021_valid", download=True)
seconds_spent = time.time() - start_time
print(f"Prepared data in {seconds_spent/60:.2f}m")
def link_file(src:str, dest:str):
Path(src).symlink_to(dest)


def link_files(src:str, dest:str, workers=4):
src = Path(src)
dest = Path(dest)
os.makedirs(dest, exist_ok=True)
with Pool(processes=workers) as pool:
for path, dnames, fnames in os.walk(str(src)):
rel_path = Path(path).relative_to(src)
fnames = map(lambda _f: rel_path / _f, fnames)
dnames = map(lambda _d: rel_path / _d, dnames)
for d in dnames:
os.makedirs(str(dest / d), exist_ok=True)
pool.starmap(
link_file,
[(src / _f, dest / _f) for _f in fnames]
)


if __name__ == "__main__":
src = Path(sys.argv[1])
workers = int(sys.argv[2])
# Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the
# environment variable will only be resolved on the worker node (i.e. not
# referencing the $SLURM_TMPDIR of the master node)
dest = Path(os.environ["SLURM_TMPDIR"]) / "dest"

start_time = time.time()

link_files(src, dest, workers)

# Torchvision expects these names
shutil.move(dest / "train.tar.gz", dest / "2021_train.tgz")
shutil.move(dest / "val.tar.gz", dest / "2021_valid.tgz")

INaturalist(root=dest, version="2021_train", download=True)
INaturalist(root=dest, version="2021_valid", download=True)

seconds_spent = time.time() - start_time

print(f"Prepared data in {seconds_spent/60:.2f}m")
39 changes: 0 additions & 39 deletions docs/examples/data/torchvision/data.sh

This file was deleted.

2 changes: 1 addition & 1 deletion docs/examples/data/torchvision/job.sh
@@ -39,7 +39,7 @@ mkdir -p "$SLURM_TMPDIR/data"
 # Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for
 # faster training
 srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \
-    time -p bash data.sh "/network/datasets/inat" ${_DATA_PREP_WORKERS}
+    time -p python3 data.py "/network/datasets/inat" ${_DATA_PREP_WORKERS}


# Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0
