When running the 'run_prep_artifacts.sh' script for 'es', there is an error while fetching the Wikipedia dataset. Hugging Face does not have a prebuilt dataset for Spanish, so line 53 in wikipedia_downloader.py fails and the exception branch runs, where "beam_runner" does not seem to be a valid parameter.
If I comment that out and put in a valid date (20230801 is no longer a valid date):
except Exception as _:
    # if that fails, load from original huggingface dataset and process
    ds_iterator = load_dataset(
        "wikipedia", language=self._lang, date="20240320",
        cache_dir=self._cache_dir,  # beam_runner="DirectRunner",
        split="train"
    )
    logger.info(f"{str(self)} Load {self._lang}-wiki from 20240320")
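As a possible workaround (an assumption on my side, in case preprocessed dumps are acceptable for this pipeline), the wikimedia/wikipedia dataset on the Hub ships prebuilt snapshots that do include Spanish and need neither a date lookup nor Beam processing. A minimal sketch:

from datasets import load_dataset

# Prebuilt snapshot; "20231101.es" is the Spanish config available at the
# time of writing. self._cache_dir assumes the same downloader context
# as the snippet above.
ds_iterator = load_dataset(
    "wikimedia/wikipedia",
    "20231101.es",
    cache_dir=self._cache_dir,
    split="train",
)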
I get an error like this:
Traceback (most recent call last):
File "/usr/local/lib/python3.11/site-packages/fsspec/implementations/http.py", line 419, in _info
await _file_info(
File "/usr/local/lib/python3.11/site-packages/fsspec/implementations/http.py", line 832, in _file_info
r.raise_for_status()
File "/usr/local/lib/python3.11/site-packages/aiohttp/client_reqrep.py", line 1060, in raise_for_status
raise ClientResponseError(
aiohttp.client_exceptions.ClientResponseError: 404, message='Not Found', url=URL('https://dumps.wikimedia.org/eswiki/20220301/dumpstatus.json')
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/usr/app/src/artifacts/downloaders/wikipedia_downloader.py", line 53, in run
ds_iterator = load_dataset(
^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/datasets/load.py", line 2575, in load_dataset
return builder_instance.as_streaming_dataset(split=split)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/datasets/builder.py", line 1382, in as_streaming_dataset
splits_generators = {sg.name: sg for sg in self._split_generators(dl_manager)}
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.cache/huggingface/modules/datasets_modules/datasets/wikipedia/d41137e149b2ea90eead07e7e3f805119a8c22dd1d5b61651af8e3e3ee736001/wikipedia.py", line 977, in _split_generators
with open(downloaded_files["info"], encoding="utf-8") as f:
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/datasets/streaming.py", line 75, in wrapper
return function(*args, download_config=download_config, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/datasets/download/streaming_download_manager.py", line 512, in xopen
file_obj = fsspec.open(file, mode=mode, *args, **kwargs).open()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/fsspec/core.py", line 135, in open
return self.__enter__()
^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/fsspec/core.py", line 103, in __enter__
f = self.fs.open(self.path, mode=mode)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/fsspec/spec.py", line 1293, in open
f = self._open(
^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/fsspec/implementations/http.py", line 358, in _open
size = size or self.info(path, **kwargs)["size"]
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/fsspec/asyn.py", line 118, in wrapper
return sync(self.loop, func, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/fsspec/asyn.py", line 103, in sync
raise return_result
File "/usr/local/lib/python3.11/site-packages/fsspec/asyn.py", line 56, in _runner
result[0] = await coro
^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/fsspec/implementations/http.py", line 432, in _info
raise FileNotFoundError(url) from exc
FileNotFoundError: https://dumps.wikimedia.org/eswiki/20220301/dumpstatus.json
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/app/src/prep_artifacts.py", line 186, in <module>
main(artifacts_dir=args.artifacts_dir,
File "/usr/app/src/prep_artifacts.py", line 122, in main
wikipedia.run(logger=logger)
File "/usr/app/src/artifacts/downloaders/wikipedia_downloader.py", line 60, in run
ds_iterator = load_dataset(
^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/datasets/load.py", line 2582, in load_dataset
builder_instance.download_and_prepare(
File "/usr/local/lib/python3.11/site-packages/datasets/builder.py", line 1005, in download_and_prepare
self._download_and_prepare(
File "/usr/local/lib/python3.11/site-packages/datasets/builder.py", line 1078, in _download_and_prepare
split_generators = self._split_generators(dl_manager, **split_generators_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.cache/huggingface/modules/datasets_modules/datasets/wikipedia/d41137e149b2ea90eead07e7e3f805119a8c22dd1d5b61651af8e3e3ee736001/wikipedia.py", line 981, in _split_generators
multistream_dump_info["status"] == "done"
AssertionError: Specified dump (https://dumps.wikimedia.org/eswiki/20240320/) multistream status is not 'done': waiting
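So both failures seem to come from the legacy wikipedia script building the dump on the fly from dumps.wikimedia.org: the hard-coded default date (20220301) has since been pruned from the server, hence the 404, and the newer date (20240320) was still being generated, hence the 'waiting' status. Below is a sketch of a guard that would only pass finished dumps to load_dataset (check_dump_done is a hypothetical helper, not part of this repo):

import requests

def check_dump_done(lang: str, date: str) -> bool:
    # Hypothetical helper: True if the multistream dump for this
    # language/date has finished generating on the Wikimedia side.
    url = f"https://dumps.wikimedia.org/{lang}wiki/{date}/dumpstatus.json"
    resp = requests.get(url, timeout=30)
    if resp.status_code == 404:
        return False  # dump pruned from the server, or never existed
    # dumpstatus.json lists per-job statuses; the loader asserts on this job
    jobs = resp.json()["jobs"]
    return jobs["articlesmultistreamdump"]["status"] == "done"

With something like this, the downloader could walk the dates listed at https://dumps.wikimedia.org/eswiki/ and pick the newest one for which check_dump_done returns True, instead of hard-coding a date.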