Skip to content

Commit

Permalink
fix: converted the transkribus collection into jsonl and xml format f…
Browse files Browse the repository at this point in the history
…or line segmentation
  • Loading branch information
jim-gyas committed Jul 3, 2024
1 parent 9bfa1f1 commit 0aa2c87
Show file tree
Hide file tree
Showing 5 changed files with 127 additions and 102 deletions.
127 changes: 36 additions & 91 deletions src/format_line_segmentations/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,50 +74,6 @@ def create_directories(paths):
if not os.path.exists(output_dir):
os.makedirs(output_dir)

# Process Transkribus Data (XML) files for each directory
def get_xml_paths(base_path, output_base_path):
    """Collect Transkribus XML directories under *base_path*.

    A directory qualifies when its name contains ``xml`` or ``page``
    (case-insensitive) and it directly contains at least one ``.xml`` file.
    For each qualifying directory, derive an output JSONL path and an
    output XML directory under *output_base_path*, named after the
    directory's relative path with separators replaced by underscores.

    Returns a 3-tuple of parallel lists:
    (input_dirs, output_jsonl_paths, output_xml_dirs).
    """
    xml_dirs = []
    jsonl_paths = []
    xml_out_dirs = []
    for current, _subdirs, filenames in os.walk(base_path):
        dir_name = os.path.basename(current).lower()
        if "xml" not in dir_name and "page" not in dir_name:
            continue
        if not any(name.endswith(".xml") for name in filenames):
            continue
        xml_dirs.append(current)
        stem = os.path.relpath(current, base_path).replace(os.sep, "_")
        jsonl_paths.append(os.path.join(output_base_path, stem + ".jsonl"))
        xml_out_dirs.append(os.path.join(output_base_path, stem + "_xml"))
    return xml_dirs, jsonl_paths, xml_out_dirs

#Process XML files for Transkribus data
def process_xml_files(input_directories, output_files, output_directories, dataset_name,
                      timestamp="2024-06-10T11:08:30.326+00:00"):
    """Convert Transkribus PageXML files to JSONL and normalized XML.

    For each (input_directory, output_file, output_directory) triple, every
    ``.xml`` file in the input directory — excluding ``metadata*`` and
    ``mets.xml`` — is parsed, written as one JSONL line into *output_file*,
    and re-serialized as a prettified XML file in *output_directory*.

    Args:
        input_directories: directories containing PageXML files.
        output_files: JSONL output paths, parallel to *input_directories*.
        output_directories: XML output dirs, parallel to *input_directories*.
        dataset_name: dataset label embedded in the converted XML.
        timestamp: creation timestamp embedded in the converted XML
            (previously hard-coded; default preserves the old value).
    """
    for input_directory, output_file, output_directory in zip(input_directories, output_files, output_directories):
        os.makedirs(output_directory, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as output_f:
            # Single directory scan. The original built a dict from an
            # identical listdir/filter pass and then looked each file up in
            # it, so the lookup could never fail — pure redundant work.
            for filename in os.listdir(input_directory):
                lowered = filename.lower()
                if not lowered.endswith(".xml"):
                    continue
                if lowered.startswith("metadata") or lowered == "mets.xml":
                    continue
                file_id = os.path.splitext(filename)[0]
                file_path = os.path.join(input_directory, filename)
                ocr_data = process_xml_file(file_path)
                # NOTE(review): the second argument here is the XML path
                # itself (the original's "image_files" dict mapped xml stem
                # -> xml path) — confirm extract_metadata_from_xml expects
                # the source-file path, not an image path.
                image_metadata = extract_metadata_from_xml(ocr_data, file_path)
                if image_metadata:
                    jsonl_output = convert_to_jsonl(ocr_data, image_metadata)
                    output_f.write(jsonl_output + '\n')
                    xml_root = convert_to_xml(ocr_data, image_metadata, dataset_name, timestamp)
                    xml_output = prettify_xml(xml_root)
                    output_file_path = os.path.join(output_directory, f"{file_id}.xml")
                    with open(output_file_path, 'w', encoding='utf-8') as output_xml:
                        output_xml.write(xml_output)

# Process Google Books Data (HTML) files
def process_google_books_html_files(paths):
image_files_html = {os.path.splitext(f)[0]: os.path.join(paths["google_books"]["input_images"], f)
Expand All @@ -140,27 +96,34 @@ def process_google_books_html_files(paths):
with open(output_file_path, 'w', encoding='utf-8') as output_file_google_books:
output_file_google_books.write(xml_output)

# Process XML files for HTR team data
def process_htr_teams_xml_files(paths):
    """Convert the HTR-team PageXML files to JSONL and normalized XML.

    Pairs each ``.xml`` file in ``paths["aws"]["input_xml"]`` with a
    same-stem ``.jpg`` in ``paths["aws"]["input_images"]``; pairs with no
    matching image are skipped.  Writes one JSONL line per converted file
    to ``output_jsonl`` and a prettified XML file per input to ``output_xml``.
    """
    aws = paths["aws"]
    images_by_stem = {}
    for name in os.listdir(aws["input_images"]):
        if name.lower().endswith(".jpg"):
            images_by_stem[os.path.splitext(name)[0]] = os.path.join(aws["input_images"], name)
    with open(aws["output_jsonl"], 'w', encoding='utf-8') as jsonl_f:
        for xml_name in os.listdir(aws["input_xml"]):
            if not xml_name.endswith(".xml"):
                continue
            stem = os.path.splitext(xml_name)[0]
            image_path = images_by_stem.get(stem)
            if not image_path:
                continue
            ocr_data = process_xml_file(os.path.join(aws["input_xml"], xml_name))
            metadata = extract_metadata_from_xml(ocr_data, image_path)
            if ocr_data and metadata:
                jsonl_f.write(convert_to_jsonl(ocr_data, metadata) + '\n')
                pretty = prettify_xml(convert_to_xml(ocr_data, metadata, "HTR Team"))
                out_path = os.path.join(aws["output_xml"], f"{stem}.xml")
                with open(out_path, 'w', encoding='utf-8') as xml_f:
                    xml_f.write(pretty)
# Process XML files for Transkribus and HTR team data
def process_xml_data(paths):
    """Convert the XML-based datasets (Transkribus and HTR-team/aws) to JSONL and XML.

    For each dataset entry in *paths* whose key is ``"transkribus"`` or
    ``"aws"``, pairs every ``.xml`` file under ``input_xml_base`` with a
    same-stem ``.jpg`` under ``input_images`` (unpaired XML files are
    skipped), writes one JSONL line per pair to ``output_jsonl`` and a
    prettified XML file per pair into ``output_xml``.  Other dataset keys
    (e.g. ``"google_books"``) are ignored.
    """
    for dataset, dataset_paths in paths.items():
        # Both XML datasets were handled by two byte-identical branches in
        # the original; a single membership guard is equivalent.
        if dataset not in ("transkribus", "aws"):
            continue
        image_files = {
            os.path.splitext(f)[0]: os.path.join(dataset_paths["input_images"], f)
            for f in os.listdir(dataset_paths["input_images"])
            if f.lower().endswith(".jpg")
        }
        with open(dataset_paths["output_jsonl"], 'w', encoding='utf-8') as output_jsonl:
            for filename in os.listdir(dataset_paths["input_xml_base"]):
                if not filename.endswith(".xml"):
                    continue
                file_id = os.path.splitext(filename)[0]
                image_file = image_files.get(file_id)
                if not image_file:
                    continue
                file_path = os.path.join(dataset_paths["input_xml_base"], filename)
                ocr_data = process_xml_file(file_path)
                image_metadata = extract_metadata_from_xml(ocr_data, image_file)
                if ocr_data and image_metadata:
                    output_jsonl.write(convert_to_jsonl(ocr_data, image_metadata) + '\n')
                    # NOTE(review): capitalize() yields "Aws" for the HTR-team
                    # data, whereas the function this replaced labeled it
                    # "HTR Team" — confirm the intended dataset label.
                    xml_root = convert_to_xml(ocr_data, image_metadata, dataset.capitalize())
                    xml_output = prettify_xml(xml_root)
                    output_file_path = os.path.join(dataset_paths["output_xml"], f"{file_id}.xml")
                    with open(output_file_path, 'w', encoding='utf-8') as output_xml:
                        output_xml.write(xml_output)

# Main function to process HTML and XML files and convert them to JSONL and XML formats.
def main():
Expand All @@ -178,37 +141,19 @@ def main():
"input_images": f"{base_path}google_books/google_books_images_folder/",
"output_jsonl": f"{output_base_path}google_books_data.jsonl",
"output_xml": f"{output_base_path}google_books_data_xml/"
},
},
"transkribus": {
"stok_kangyur": {
"input_xml_base": f"{base_path}transkrisbus/stok_kangyur/"
},
"phudrak": {
"input_xml_base": f"{base_path}transkrisbus/phudrak/"
},
"derge_kangyur": {
"input_xml_base": f"{base_path}transkrisbus/derge-kangyur/"
},
"tib_school": {
"input_xml_base": f"{base_path}transkrisbus/tib_school/"
}
}
"input_xml_base": f"{base_path}transkribus/",
"input_images": f"{base_path}transkribus_images_folder/",
"output_jsonl": f"{output_base_path}transkribus_data.jsonl",
"output_xml": f"{output_base_path}transkribus_data_xml/"
}
}
create_directories(paths)
    # Process HTML files for Google Books data
process_google_books_html_files(paths)
transkribus_datasets = {
"Transkribus Stok Kangyur": paths["transkribus"]["stok_kangyur"]["input_xml_base"],
"Transkribus Phudrak": paths["transkribus"]["phudrak"]["input_xml_base"],
"Transkribus Derge Kangyur": paths["transkribus"]["derge_kangyur"]["input_xml_base"],
"Transkribus Derge Kangyur": paths["transkribus"]["tib_school"]["input_xml_base"]
}
for dataset_name, input_xml_base in transkribus_datasets.items():
input_xml, output_jsonl, output_xml = get_xml_paths(input_xml_base, output_base_path)
process_xml_files(input_xml, output_jsonl, output_xml, dataset_name) # Process XML files for Transkribus data
# Process XML files for HTR team data
process_htr_teams_xml_files(paths)
    # Process XML files for Transkribus and HTR team data
process_xml_data(paths)


if __name__ == "__main__":
main()
6 changes: 3 additions & 3 deletions src/format_line_segmentations/extract_zip.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,19 @@
import os
import zipfile


def extract_zip(zip_path, extract_to):
    """Unpack the ZIP archive at *zip_path* into the directory *extract_to*."""
    archive = zipfile.ZipFile(zip_path, "r")
    try:
        archive.extractall(extract_to)
    finally:
        archive.close()


def find_and_extract_zip(root_path, output_path):
"""
Recursively finds ZIP files in the given directory and subdirectories,
and extracts them.
"""
count =0
for root, dirs, files in os.walk(root_path):
for filename in files:
if filename.endswith(".zip"):
Expand All @@ -33,7 +32,8 @@ def find_and_extract_zip(root_path, output_path):
find_and_extract_zip(
extract_to, extract_to
) # Adjusted to use extract_to for both parameters

count=count+1
print(count)

if __name__ == "__main__":
root_path = "../../data/tib_school_zip"
Expand Down
7 changes: 3 additions & 4 deletions src/format_line_segmentations/formatting_google_books_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
import shutil
from PIL import Image

ROOT_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/extracted_data'
HTML_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/google_books_html_folder'
IMAGES_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/google_books_images_folder'
ROOT_DIR = '../../data/extracted_data'
HTML_DEST_DIR = '../../data/google_books_html_folder'
IMAGES_DEST_DIR = '../../data/google_books_images_folder'

def convert_tiff_to_jpg(src_path, dest_path):
with Image.open(src_path) as img:
Expand Down Expand Up @@ -53,7 +53,6 @@ def main():
os.makedirs(HTML_DEST_DIR)
if not os.path.exists(IMAGES_DEST_DIR):
os.makedirs(IMAGES_DEST_DIR)

folder_b_count = 0
for folder_b in os.listdir(ROOT_DIR):
folder_b_path = os.path.join(ROOT_DIR, folder_b)
Expand Down
7 changes: 3 additions & 4 deletions src/format_line_segmentations/formatting_htr_team_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
import shutil
from PIL import Image

ROOT_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_team_data'
XML_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_teams/htr_team_xml_folder'
IMAGES_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_teams/htr_team_images_folder'
ROOT_DIR = '../../data/htr_team_data'
XML_DEST_DIR = '../../data/htr_teams/htr_team_xml_folder'
IMAGES_DEST_DIR = '../../data/htr_teams/htr_team_images_folder'

def convert_to_jpg(src_path, dest_path):
with Image.open(src_path) as img:
Expand Down Expand Up @@ -39,7 +39,6 @@ def main():
os.makedirs(XML_DEST_DIR)
if not os.path.exists(IMAGES_DEST_DIR):
os.makedirs(IMAGES_DEST_DIR)

folder_b_count = 0
for folder_b in os.listdir(ROOT_DIR):
folder_b_path = os.path.join(ROOT_DIR, folder_b)
Expand Down
82 changes: 82 additions & 0 deletions src/format_line_segmentations/formatting_transkribus_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import os
import shutil
from PIL import Image

ROOT_DIR = '../../data/transkribus_extracted_data'
XML_DEST_DIR = '../../data/transkribus_xml_folder'
IMAGES_DEST_DIR = '../../data/transkribus_images_folder'

def convert_tiff_to_jpg(src_path, dest_path):
    """Re-encode the image at *src_path* as JPEG, writing it to *dest_path*."""
    with Image.open(src_path) as source:
        rgb = source.convert('RGB')  # JPEG has no alpha/palette; force RGB
        rgb.save(dest_path, 'JPEG')

def copy_files(xml_folder, folder_c_path, dest_xml_dir, dest_images_dir, prefix=''):
    """Copy XML/image pairs that share a filename stem into the destination dirs.

    Only stems present in BOTH *xml_folder* (as ``.xml``) and *folder_c_path*
    (as a recognized image extension) are copied.  XML files are copied as
    ``<prefix>_<stem>.xml``; images are written as ``<prefix>_<stem>.jpg``.

    Args:
        xml_folder: directory holding the ``.xml`` files.
        folder_c_path: directory holding the source images.
        dest_xml_dir / dest_images_dir: destination directories (must exist).
        prefix: string prepended (with ``_``) to every copied file name.

    Returns:
        (xml_dests, image_dests): parallel lists of the copied file paths.
    """
    image_exts = ('.tif', '.tiff', '.jpg', '.jpeg', '.png', '.gif')
    xml_stems = {os.path.splitext(f)[0] for f in os.listdir(xml_folder)
                 if f.lower().endswith('.xml')}
    images_by_stem = {}
    for name in os.listdir(folder_c_path):
        if name.lower().endswith(image_exts):
            # keep the first match per stem (matches original next(...) pick)
            images_by_stem.setdefault(os.path.splitext(name)[0], name)

    xml_dests = []
    image_dests = []
    # sorted() makes the copy order deterministic (set iteration was arbitrary).
    for stem in sorted(xml_stems & set(images_by_stem)):
        xml_dest = os.path.join(dest_xml_dir, f"{prefix}_{stem}.xml")
        shutil.copy2(os.path.join(xml_folder, f"{stem}.xml"), xml_dest)
        xml_dests.append(xml_dest)

        image_name = images_by_stem[stem]
        image_src = os.path.join(folder_c_path, image_name)
        image_dest = os.path.join(dest_images_dir, f"{prefix}_{stem}.jpg")
        if image_name.lower().endswith(('.jpg', '.jpeg')):
            shutil.copy2(image_src, image_dest)
        else:
            # BUG FIX: the original copied PNG/GIF bytes verbatim under a
            # .jpg name; re-encode every non-JPEG source so the extension
            # matches the actual file format.
            convert_tiff_to_jpg(image_src, image_dest)
        image_dests.append(image_dest)
    return xml_dests, image_dests

def process_folder_C(folder_b_name, folder_c_path, dest_xml_dir, dest_images_dir, folder_c_name):
    """Copy the paired XML/image files out of one third-level ("C") folder.

    Expects *folder_c_path* to contain an ``xml`` subfolder; if it does not,
    the folder is skipped with a message.  Copied files are prefixed with
    ``<folder_b_name>_<folder_c_name>`` to keep names unique across folders.
    """
    xml_folder = os.path.join(folder_c_path, 'xml')
    # BUG FIX: the original printed two different "skipping" messages for the
    # same missing-folder condition; a single guard replaces both.
    if not os.path.exists(xml_folder):
        print(f"Skipping {folder_c_path} - No xml folder found.")
        return
    prefix = f"{folder_b_name}_{folder_c_name}"
    xml_files, image_files = copy_files(xml_folder, folder_c_path, dest_xml_dir, dest_images_dir, prefix=prefix)
    xml_count = len(xml_files)
    image_count = len(image_files)
    print(f"Folder {folder_c_path} - XML files: {xml_count}, Image files: {image_count}")
    # copy_files appends to both lists in lockstep, so this warning should
    # never fire; kept as a cheap sanity check.
    if xml_count != image_count:
        print(f"WARNING: Folder {folder_c_path} has {xml_count} XML files and {image_count} image files.")

def process_folder_B(folder_b_path, dest_xml_dir, dest_images_dir, folder_b_name):
    """Process every subdirectory ("C" folder) of one second-level ("B") folder.

    Non-directory entries are reported and skipped.
    """
    for entry in os.listdir(folder_b_path):
        entry_path = os.path.join(folder_b_path, entry)
        if not os.path.isdir(entry_path):
            print(f"Skipping non-directory item: {entry_path}")
            continue
        print(f"Processing {entry_path}...")
        process_folder_C(folder_b_name, entry_path, dest_xml_dir, dest_images_dir, entry)

def main():
    """Create the destination folders, then process every top-level ("B")
    directory under ROOT_DIR, copying paired XML/image files into
    XML_DEST_DIR and IMAGES_DEST_DIR."""
    os.makedirs(XML_DEST_DIR, exist_ok=True)
    os.makedirs(IMAGES_DEST_DIR, exist_ok=True)
    folder_b_count = 0
    for folder_b in os.listdir(ROOT_DIR):
        folder_b_path = os.path.join(ROOT_DIR, folder_b)
        if os.path.isdir(folder_b_path):
            print(f"Processing {folder_b_path}...")
            process_folder_B(folder_b_path, XML_DEST_DIR, IMAGES_DEST_DIR, folder_b)
            folder_b_count += 1
    # BUG FIX: the original printed two near-identical summary lines;
    # one is enough.
    print(f"Processed {folder_b_count} 'Folder B' directories.")

if __name__ == "__main__":
main()

0 comments on commit 0aa2c87

Please sign in to comment.