Skip to content

Commit

Permalink
Be more explicit on how to use tar datasets with DataLoader in our example (#103)
Browse files Browse the repository at this point in the history

Co-authored-by: Simon Beal <[email protected]>
  • Loading branch information
muddyfish and muddyfish authored Jan 11, 2024
1 parent cf4af72 commit d2e5e9d
Showing 1 changed file with 10 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -643,7 +643,7 @@
{
"data": {
"text/plain": [
"'2022.001.0000.00.DISC.01.jpg.image.jpg'"
"['2022.001.0000.00.DISC.01.jpg.image.jpg']"
]
},
"metadata": {},
Expand All @@ -663,7 +663,7 @@
{
"data": {
"text/plain": [
"'2022.001.0320.00.DISC.01.jpg.image.jpg'"
"['2022.001.0320.00.DISC.01.jpg.image.jpg']"
]
},
"metadata": {},
Expand All @@ -683,7 +683,7 @@
{
"data": {
"text/plain": [
"'2022.001.0640.00.DISC.01.jpg.image.jpg'"
"['2022.001.0640.00.DISC.01.jpg.image.jpg']"
]
},
"metadata": {},
Expand Down Expand Up @@ -745,12 +745,15 @@
"def shard_to_dict(object):\n",
" return {\"url\": object.key, \"stream\": object}\n",
"\n",
"dataset = s3torchconnector.S3IterableDataset.from_prefix(SHARDS_URI, region=REGION, transform=shard_to_dict)\n",
"dataset = webdataset.tariterators.tar_file_expander(dataset)\n",
"s3_dataset = s3torchconnector.S3IterableDataset.from_prefix(SHARDS_URI, region=REGION, transform=shard_to_dict)\n",
"tar_dataset = webdataset.tariterators.tar_file_expander(s3_dataset)\n",
"dataset = torchdata.datapipes.iter.IterableWrapper(tar_dataset, deepcopy=False)\n",
"\n",
"for sample in itertools.islice(dataset, 0, 100, 20):\n",
"loader = torch.utils.data.DataLoader(dataset)\n",
"\n",
"for sample in itertools.islice(loader, 0, 100, 20):\n",
" display(sample[\"fname\"])\n",
" display(Image.open(io.BytesIO(sample[\"data\"])).reduce(8))"
" display(Image.open(io.BytesIO(sample[\"data\"][0])).reduce(8))"
]
},
{
Expand Down

0 comments on commit d2e5e9d

Please sign in to comment.