Skip to content

Commit

Permalink
fix broken download of auxiliary files
Browse files Browse the repository at this point in the history
  • Loading branch information
apdavison committed Aug 29, 2023
1 parent 340a42a commit d5525f2
Show file tree
Hide file tree
Showing 3 changed files with 201 additions and 120 deletions.
142 changes: 83 additions & 59 deletions api/data_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from urllib.parse import urlparse, urlunparse
from fastapi import HTTPException, status
import neo.io
import quantities as pq

from . import settings

Expand All @@ -32,7 +33,7 @@ def get_base_url_and_path(url):
"",
)
)
return base_url, url_parts.path
return base_url, os.path.basename(url_parts.path)


def get_cache_path(url):
Expand All @@ -41,67 +42,80 @@ def get_cache_path(url):
based on the URL, but files in the same directory on the original server end up in the
same directory in our cache.
"""
base_url, base_path = get_base_url_and_path(url)
base_url, filename = get_base_url_and_path(url)
dir_name = hashlib.sha1(base_url.encode("utf-8")).hexdigest()
dir_path = os.path.join(
getattr(settings, "DOWNLOADED_FILE_CACHE_DIR", ""), dir_name
)
os.makedirs(dir_path, exist_ok=True)
return os.path.join(dir_path, os.path.basename(base_path))


def list_files_to_download(io_cls, resolved_url, main_path):
file_list = [(resolved_url, main_path, True)]
io_mode = getattr(io_cls, "rawmode", None)
if io_mode == "one-dir":
# In general, we don't know the names of the individual files
# and have no way to get a directory listing from a URL
# so we raise an exception
if io_cls.__name__ in ("PhyIO"):
# for the exceptions, resolved_url must represent a directory
raise NotImplementedError # todo: for these ios, the file names are known
else:
return dir_path, filename


def list_files_to_download(resolved_url, cache_dir, io_cls=None):
base_url, main_file = get_base_url_and_path(resolved_url)
file_list = [(resolved_url, os.path.join(cache_dir, main_file), True)]
if io_cls:
root_path, ext = os.path.splitext(main_file)
io_mode = getattr(io_cls, "rawmode", None)
if io_mode == "one-dir":
# In general, we don't know the names of the individual files
# and have no way to get a directory listing from a URL
# so we raise an exception
if io_cls.__name__ in ("PhyIO"):
# for the exceptions, resolved_url must represent a directory
raise NotImplementedError # todo: for these ios, the file names are known
else:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=(
"Cannot download files from a URL representing a directory. "
"Please provide the URL of a zip or tar archive of the directory."
)
)
elif io_mode == "multi-file":
# Here the resolved_url represents a single file, with or without the file extension.
# By taking the base/root path and adding various extensions we get a list of files to download
for extension in io_cls.extensions:
file_list.append(
# Neo doesn't tell us which files are required and which are optional
# so we have to treat them all as optional at this stage
(f"{base_url}/{root_path}.{extension}", f"{cache_dir}/{root_path}.{extension}", False)
)
elif io_cls.__name__ == "BrainVisionIO":
# should io_mode be "multi-file" for this? currently "one-file"
for extension in ("eeg", "vmrk"):
file_list.append(
(f"{base_url}/{root_path}.{extension}", f"{cache_dir}/{root_path}.{extension}", True)
)
elif io_cls.__name__ == "ElanIO":
for extension in ("eeg.ent", "eeg.pos"):
file_list.append(
(f"{base_url}/{root_path}.{extension}", f"{cache_dir}/{root_path}.{extension}", True)
)
elif io_mode == "one-file":
# Here the resolved url should represent a single file,
# which could have different possible extensions
# todo: check the URL extension matches one of the possible extensions
# and raise an exception otherwise
pass
elif io_cls.mode == "dir":
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=(
"Cannot download files from a URL representing a directory. "
"Please provide the URL of a zip or tar archive of the directory."
)
)
elif io_mode == "multi-file":
# Here the resolved_url represents a single file, with or without the file extension.
# By taking the base/root path and adding various extensions we get a list of files to download
base_url, base_path = get_base_url_and_path(resolved_url)
for extension in io_cls.extensions:
file_list.append(
# Neo doesn't tell us which files are required and which are optional
# so we have to treat them all as optional at this stage
(f"{base_url}.{extension}", f"{base_path}.{extension}", False)
)
elif io_mode == "one-file":
# Here the resolved url should represent a single file,
# which could have different possible extensions
# todo: check the URL extension matches one of the possible extensions
# and raise an exception otherwise
pass
elif io_cls.mode == "dir":
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=(
"Cannot download files from a URL representing a directory. "
"Please provide the URL of a zip or tar archive of the directory."
)
)
else:
# we assume the resolved url represents a single file
# certain IOs have additional metadata files
if io_cls.__name__ == "AsciiSignalIO":
# if we have a text file, try to download the accompanying json file
name, ext = os.path.splitext(main_path)
if ext[1:] in neo.io.AsciiSignalIO.extensions: # ext has a leading '.'
metadata_filename = main_path.replace(ext, "_about.json")
metadata_url = resolved_url.replace(ext, "_about.json")
file_list.append((metadata_url, metadata_filename, False))
else:
# we assume the resolved url represents a single file
# certain IOs have additional metadata files
if io_cls.__name__ == "AsciiSignalIO":
# if we have a text file, try to download the accompanying json file
name, ext = os.path.splitext(main_file)
if ext[1:] in neo.io.AsciiSignalIO.extensions: # ext has a leading '.'
metadata_filename = main_file.replace(ext, "_about.json")
metadata_url = resolved_url.replace(ext, "_about.json")
file_list.append((metadata_url, f"{cache_dir}/{metadata_filename}", False))
return file_list


Expand All @@ -123,13 +137,9 @@ def download_neo_data(url, io_cls=None):
)
resolved_url = response.geturl()

main_path = get_cache_path(resolved_url)
if not os.path.exists(main_path):
if io_cls:
files_to_download = list_files_to_download(io_cls, resolved_url, main_path)
else:
files_to_download = [(resolved_url, main_path, True)]

cache_dir, main_file = get_cache_path(resolved_url)
if not os.path.exists(os.path.join(cache_dir, main_file)):
files_to_download = list_files_to_download(resolved_url, cache_dir, io_cls)
for file_url, file_path, required in files_to_download:
try:
urlretrieve(file_url, file_path)
Expand All @@ -140,10 +150,19 @@ def download_neo_data(url, io_cls=None):
status_code=status.HTTP_404_NOT_FOUND, # maybe use 501 Not Implemented?
detail=f"Problem downloading '{file_url}'"
)

main_path = files_to_download[0][1]
else:
main_path = os.path.join(cache_dir, main_file)
return main_path



# Extra keyword arguments that specific Neo IO classes require when calling
# io.read(); looked up by class name in load_blocks().
# NestIO refuses to read without an explicit gid_list and time window, so we
# supply an empty gid_list (all GIDs) and a 0–1e6 ms window — presumably wide
# enough to cover typical recordings; TODO confirm against real NEST files.
extra_kwargs = {
    "NestIO": {
        "gid_list": [], "t_start": 0 * pq.ms, "t_stop": 1e6 * pq.ms
    }
}

def load_blocks(url, io_class_name=None):
"""
Load the first block from the data file at the given URL.
Expand All @@ -160,6 +179,8 @@ def load_blocks(url, io_class_name=None):
try:
if io_cls.mode == "dir":
io = io_cls(dirname=main_path)
elif io_cls.__name__ == "NestIO":
io = io_cls(filenames=main_path)
else:
io = io_cls(filename=main_path)
except ImportError:
Expand All @@ -186,10 +207,13 @@ def load_blocks(url, io_class_name=None):
if io.support_lazy:
blocks = io.read(lazy=True)
else:
blocks = io.read()
kwargs = extra_kwargs.get(io.__class__.__name__, {})
blocks = io.read(**kwargs)
except (AssertionError, ValueError, IndexError, KeyError, AttributeError) as err:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f'Error when trying to open file with {io.__class__.__name__}: "{err}"',
)
if hasattr(io, "close"):
io.close()
return blocks
48 changes: 48 additions & 0 deletions api/test/test_data_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""
"""

import os.path
from neo.io import BrainVisionIO
from ..data_handler import get_base_url_and_path, get_cache_path, list_files_to_download


def test_get_base_url_and_path():
    """get_base_url_and_path() should split a URL into the URL of its parent
    directory and the bare filename component."""
    cases = [
        (
            "https://gin.g-node.org/NeuralEnsemble/ephy_testing_data/raw/master/brainvision/File_brainvision_1.vhdr",
            "https://gin.g-node.org/NeuralEnsemble/ephy_testing_data/raw/master/brainvision",
            "File_brainvision_1.vhdr",
        ),
        (
            "https://gin.g-node.org/NeuralEnsemble/ephy_testing_data/src/master/maxwell/MaxOne_data/Record/000011/data.raw.h5",
            "https://gin.g-node.org/NeuralEnsemble/ephy_testing_data/src/master/maxwell/MaxOne_data/Record/000011",
            "data.raw.h5",
        ),
    ]
    for url, expected_base, expected_name in cases:
        base_url, filename = get_base_url_and_path(url)
        assert base_url == expected_base
        assert filename == expected_name


def test_get_cache_path():
    """get_cache_path() should map a URL to a cache directory whose name is
    the SHA1 of the base URL, returning that directory plus the filename."""
    url = "https://gin.g-node.org/NeuralEnsemble/ephy_testing_data/raw/master/brainvision/File_brainvision_1.vhdr"
    cache_path, filename = get_cache_path(url)
    # the expected directory lives in the package's download cache, one level
    # up from this test module, named after the SHA1 of the base URL
    expected_dir = os.path.join(
        os.path.dirname(__file__),
        "..",
        "download_cache/603fed2393f75a3f294fceac99640f7d4a42f74d",
    )
    assert cache_path == os.path.realpath(expected_dir)
    assert filename == "File_brainvision_1.vhdr"

def test_list_files_to_download():
    """For BrainVisionIO, list_files_to_download() should schedule the .eeg
    and .vmrk companion files alongside the .vhdr named in the URL, with all
    three marked as required."""
    url = "https://gin.g-node.org/NeuralEnsemble/ephy_testing_data/raw/master/brainvision/File_brainvision_1.vhdr"
    result = list_files_to_download(url, "the_cache_dir", BrainVisionIO)
    expected = []
    for ext in (".vhdr", ".eeg", ".vmrk"):
        expected.append(
            (url.replace(".vhdr", ext), f"the_cache_dir/File_brainvision_1{ext}", True)
        )
    assert result == expected
Loading

0 comments on commit d5525f2

Please sign in to comment.