Skip to content

Commit

Permalink
[spark] Fixes python tarslip security concern (#2995)
Browse files Browse the repository at this point in the history
* [spark] Fixes python tarslip security concern

* reformat python code
  • Loading branch information
frankfliu authored Feb 16, 2024
1 parent d825cf7 commit 1fcca33
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 70 deletions.
136 changes: 68 additions & 68 deletions basicdataset/src/main/resources/imagenet/extract_imagenet.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
_VAL_TAR = 'ILSVRC2012_img_val.tar'
_VAL_TAR_SHA1 = '5f3f73da3395154b60528b2b2a2caf2374f5f178'


def download(url, path=None, overwrite=False, sha1_hash=None):
"""Download an given URL
Parameters
Expand Down Expand Up @@ -42,26 +43,29 @@ def download(url, path=None, overwrite=False, sha1_hash=None):
else:
fname = path

if overwrite or not os.path.exists(fname) or (sha1_hash and not check_sha1(fname, sha1_hash)):
if overwrite or not os.path.exists(fname) or (
sha1_hash and not check_sha1(fname, sha1_hash)):
dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname)))
if not os.path.exists(dirname):
os.makedirs(dirname)

print('Downloading %s from %s...'%(fname, url))
print('Downloading %s from %s...' % (fname, url))
r = requests.get(url, stream=True)
if r.status_code != 200:
raise RuntimeError("Failed downloading url %s"%url)
raise RuntimeError("Failed downloading url %s" % url)
total_length = r.headers.get('content-length')
with open(fname, 'wb') as f:
if total_length is None: # no content length header
if total_length is None: # no content length header
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
if chunk: # filter out keep-alive new chunks
f.write(chunk)
else:
total_length = int(total_length)
for chunk in tqdm(r.iter_content(chunk_size=1024),
total=int(total_length / 1024. + 0.5),
unit='KB', unit_scale=False, dynamic_ncols=True):
unit='KB',
unit_scale=False,
dynamic_ncols=True):
f.write(chunk)

if sha1_hash and not check_sha1(fname, sha1_hash):
Expand All @@ -72,25 +76,34 @@ def download(url, path=None, overwrite=False, sha1_hash=None):

return fname


def parse_args():
    """Parse command-line arguments for the ImageNet setup script.

    Returns:
        argparse.Namespace with download_dir, target_dir, checksum,
        with_rec and num_thread. target_dir falls back to download_dir
        when not given.
    """
    parser = argparse.ArgumentParser(
        description='Setup the ImageNet dataset.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--download-dir',
        required=True,
        help="The directory that contains downloaded tar files")
    parser.add_argument('--target-dir',
                        help="The directory to store extracted images")
    parser.add_argument('--checksum',
                        action='store_true',
                        help="If check integrity before extracting.")
    parser.add_argument('--with-rec',
                        action='store_true',
                        help="If build image record files.")
    parser.add_argument(
        '--num-thread',
        type=int,
        default=1,
        help="Number of threads to use when building image record file.")
    args = parser.parse_args()
    # Default the extraction target to the download location.
    if args.target_dir is None:
        args.target_dir = args.download_dir
    return args


def check_sha1(filename, sha1_hash):
"""Check whether the sha1 hash of the file content matches the expected hash.
Expand All @@ -116,11 +129,13 @@ def check_sha1(filename, sha1_hash):

return sha1.hexdigest() == sha1_hash


def check_file(filename, checksum, sha1):
    """Validate that a downloaded file exists and (optionally) matches its hash.

    Args:
        filename: path of the file to validate.
        checksum: if truthy, verify the file's SHA1 against *sha1*.
        sha1: expected SHA1 hex digest (only used when *checksum* is truthy).

    Raises:
        ValueError: if the file is missing or its SHA1 does not match.
    """
    if not os.path.exists(filename):
        raise ValueError('File not found: ' + filename)
    if checksum and not check_sha1(filename, sha1):
        raise ValueError('Corrupted file: ' + filename)


def build_rec_process(img_dir, train=False, num_thread=1):
rec_dir = os.path.abspath(os.path.join(img_dir, '../rec'))
Expand All @@ -141,102 +156,84 @@ def build_rec_process(img_dir, train=False, num_thread=1):
# execution
import sys
cmd = [
sys.executable,
script_path,
rec_dir,
img_dir,
'--recursive',
'--pass-through',
'--pack-label',
'--num-thread',
sys.executable, script_path, rec_dir, img_dir, '--recursive',
'--pass-through', '--pack-label', '--num-thread',
str(num_thread)
]
subprocess.call(cmd)
os.remove(script_path)
os.remove(lst_path)
print('ImageRecord file for ' + prefix + ' has been built!')


def is_within_directory(directory, target):
    """Return True if *target* resolves to a path inside *directory*.

    Uses os.path.commonpath, which compares whole path components.
    The previous os.path.commonprefix check was character-based, so a
    sibling such as '/data/safe_evil' wrongly passed for '/data/safe'.
    """
    abs_directory = os.path.abspath(directory)
    abs_target = os.path.abspath(target)
    try:
        return os.path.commonpath([abs_directory, abs_target]) == abs_directory
    except ValueError:
        # Different drives (Windows) or otherwise incomparable paths:
        # definitely not contained.
        return False


def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
    """Extract *tar* into *path*, refusing archives whose entries escape it.

    Every member is validated before anything is written to disk.
    """
    escaping = [
        entry.name for entry in tar.getmembers()
        if not is_within_directory(path, os.path.join(path, entry.name))
    ]
    if escaping:
        raise Exception("Attempted Path Traversal in Tar File")
    tar.extractall(path, members, numeric_owner=numeric_owner)


def extract_train(tar_fname, target_dir, with_rec=False, num_thread=1):
    """Extract the ImageNet training tarball into per-class folders.

    The outer tar contains one tar per class; each is extracted (with a
    path-traversal guard) into its own directory and then deleted.

    Args:
        tar_fname: path to ILSVRC2012_img_train.tar.
        target_dir: directory to create and extract into (must not exist).
        with_rec: if True, also build ImageRecord files afterwards.
        num_thread: thread count passed to the record builder.
    """
    os.makedirs(target_dir)
    with tarfile.open(tar_fname) as tar:
        print("Extracting " + tar_fname + "...")
        # extract each class one-by-one
        pbar = tqdm(total=len(tar.getnames()))
        for class_tar in tar:
            pbar.set_description('Extract ' + class_tar.name)
            class_fname = os.path.join(target_dir, class_tar.name)
            # Guard against tarslip: the member must stay inside target_dir.
            if not is_within_directory(target_dir, class_fname):
                raise Exception("Attempted Path Traversal in Tar File")

            tar.extract(class_tar, target_dir)
            class_dir = os.path.splitext(class_fname)[0]
            os.mkdir(class_dir)
            # The inner per-class tar is itself extracted with the safe helper.
            with tarfile.open(class_fname) as f:
                safe_extract(f, class_dir)
            os.remove(class_fname)
            pbar.update(1)
        pbar.close()
    if with_rec:
        build_rec_process(target_dir, True, num_thread)


def extract_val(tar_fname, target_dir, with_rec=False, num_thread=1):
    """Extract the ImageNet validation tarball and sort images into class dirs.

    Args:
        tar_fname: path to ILSVRC2012_img_val.tar.
        target_dir: directory to create and extract into (must not exist).
        with_rec: if True, build ImageRecord files before moving images.
        num_thread: thread count passed to the record builder.
    """
    os.makedirs(target_dir)
    print('Extracting ' + tar_fname)
    with tarfile.open(tar_fname) as tar:
        # safe_extract validates every member against path traversal.
        safe_extract(tar, target_dir)

    # build rec file before images are moved into subfolders
    if with_rec:
        build_rec_process(target_dir, False, num_thread)
    # move images to proper subfolders using the bundled label mapping
    val_maps_file = os.path.join(os.path.dirname(__file__),
                                 'imagenet_val_maps.pklz')
    with gzip.open(val_maps_file, 'rb') as f:
        dirs, mappings = pickle.load(f)
    for d in dirs:
        os.makedirs(os.path.join(target_dir, d))
    for m in mappings:
        os.rename(os.path.join(target_dir, m[0]),
                  os.path.join(target_dir, m[1], m[0]))


def main():
args = parse_args()

target_dir = os.path.expanduser(args.target_dir)
if os.path.exists(target_dir):
raise ValueError('Target dir ['+target_dir+'] exists. Remove it first')
raise ValueError('Target dir [' + target_dir +
'] exists. Remove it first')

download_dir = os.path.expanduser(args.download_dir)
train_tar_fname = os.path.join(download_dir, _TRAIN_TAR)
Expand All @@ -247,8 +244,11 @@ def main():
build_rec = args.with_rec
if build_rec:
os.makedirs(os.path.join(target_dir, 'rec'))
extract_train(train_tar_fname, os.path.join(target_dir, 'train'), build_rec, args.num_thread)
extract_val(val_tar_fname, os.path.join(target_dir, 'val'), build_rec, args.num_thread)
extract_train(train_tar_fname, os.path.join(target_dir, 'train'),
build_rec, args.num_thread)
extract_val(val_tar_fname, os.path.join(target_dir, 'val'), build_rec,
args.num_thread)


if __name__ == '__main__':
main()
19 changes: 17 additions & 2 deletions extensions/spark/setup/djl_spark/util/files_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,21 @@ def download_and_extract(url, path):
:param url: The url of the tar file.
:param path: The path to the file to download to.
"""

def is_within_directory(directory, target):
abs_directory = os.path.abspath(directory)
abs_target = os.path.abspath(target)
prefix = os.path.commonprefix([abs_directory, abs_target])
return prefix == abs_directory

def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
for member in tar.getmembers():
member_path = os.path.join(path, member.name)
if not is_within_directory(path, member_path):
raise Exception("Attempted Path Traversal in Tar File")

tar.extractall(path, members, numeric_owner=numeric_owner)

if not os.path.exists(path):
os.makedirs(path)
if not os.listdir(path):
Expand All @@ -78,9 +93,9 @@ def download_and_extract(url, path):
if url.startswith("s3://"):
s3_download(url, tmp_file)
with tarfile.open(name=tmp_file, mode="r:gz") as t:
t.extractall(path=path)
safe_extract(t, path=path)
elif url.startswith("http://") or url.startswith("https://"):
with urlopen(url) as response, open(tmp_file, 'wb') as f:
shutil.copyfileobj(response, f)
with tarfile.open(name=tmp_file, mode="r:gz") as t:
t.extractall(path=path)
safe_extract(t, path=path)

0 comments on commit 1fcca33

Please sign in to comment.