From 0aa2c874df993e86ef42a72262b1a8419f6ecc93 Mon Sep 17 00:00:00 2001
From: jim-gyas
Date: Wed, 3 Jul 2024 08:54:45 +0530
Subject: [PATCH] fix: converted the transkribus collection into jsonl and xml format for line segmentation

---
 src/format_line_segmentations/conversion.py   | 127 +++++-------------
 src/format_line_segmentations/extract_zip.py  |   6 +-
 .../formatting_google_books_file.py           |   7 +-
 .../formatting_htr_team_file.py               |   7 +-
 .../formatting_transkribus_file.py            |  82 +++++++++++
 5 files changed, 127 insertions(+), 102 deletions(-)
 create mode 100644 src/format_line_segmentations/formatting_transkribus_file.py

diff --git a/src/format_line_segmentations/conversion.py b/src/format_line_segmentations/conversion.py
index 9977449..a62a626 100644
--- a/src/format_line_segmentations/conversion.py
+++ b/src/format_line_segmentations/conversion.py
@@ -74,50 +74,6 @@ def create_directories(paths):
         if not os.path.exists(output_dir):
             os.makedirs(output_dir)
 
-# Process Transkrisbus Data (XML) files for each directory
-def get_xml_paths(base_path, output_base_path):
-    input_dirs = []
-    output_jsonls = []
-    output_xmls = []
-    for root, dirs, files in os.walk(base_path):
-        if ('xml' in os.path.basename(root).lower() or 'page' in os.path.basename(root).lower()) and any(file.endswith(".xml") for file in files):
-            input_dirs.append(root)
-            relative_path = os.path.relpath(root, base_path)
-            jsonl_name = relative_path.replace(os.sep, '_') + '.jsonl'
-            xml_dir_name = relative_path.replace(os.sep, '_') + '_xml'
-            output_jsonls.append(os.path.join(output_base_path, jsonl_name))
-            output_xmls.append(os.path.join(output_base_path, xml_dir_name))
-    return input_dirs, output_jsonls, output_xmls
-
-#Process XML files for Transkribus data
-def process_xml_files(input_directories, output_files, output_directories, dataset_name):
-    for input_directory, output_file, output_directory in zip(input_directories, output_files, output_directories):
-        if not os.path.exists(output_directory):
-            os.makedirs(output_directory)
-        # Filter XML files excluding 'metadata.xml' and 'mets.xml'
-        image_files = {
-            os.path.splitext(f)[0]: os.path.join(input_directory, f)
-            for f in os.listdir(input_directory)
-            if f.lower().endswith(".xml") and not (f.lower().startswith("metadata") or f.lower() == "mets.xml")
-        }
-        with open(output_file, 'w', encoding='utf-8') as output_f:
-            for filename in os.listdir(input_directory):
-                if filename.lower().endswith(".xml") and not (filename.lower().startswith("metadata") or filename.lower() == "mets.xml"):
-                    file_id = os.path.splitext(filename)[0]
-                    image_file = image_files.get(file_id)
-                    if image_file:
-                        file_path = os.path.join(input_directory, filename)
-                        ocr_data = process_xml_file(file_path)
-                        image_metadata = extract_metadata_from_xml(ocr_data, image_file)
-                        if image_metadata:
-                            jsonl_output = convert_to_jsonl(ocr_data, image_metadata)
-                            output_f.write(jsonl_output + '\n')
-                            xml_root = convert_to_xml(ocr_data, image_metadata, dataset_name, "2024-06-10T11:08:30.326+00:00")
-                            xml_output = prettify_xml(xml_root)
-                            output_file_path = os.path.join(output_directory, f"{file_id}.xml")
-                            with open(output_file_path, 'w', encoding='utf-8') as output_xml:
-                                output_xml.write(xml_output)
-
 # Process Google Books Data (HTML) files
 def process_google_books_html_files(paths):
     image_files_html = {os.path.splitext(f)[0]: os.path.join(paths["google_books"]["input_images"], f)
@@ -140,27 +96,34 @@ def process_google_books_html_files(paths):
                         with open(output_file_path, 'w', encoding='utf-8') as output_file_google_books:
                            output_file_google_books.write(xml_output)
 
-# Process XML files for HTR team data
-def process_htr_teams_xml_files(paths):
-    image_files_xml = {os.path.splitext(f)[0]: os.path.join(paths["aws"]["input_images"], f)
-                       for f in os.listdir(paths["aws"]["input_images"]) if f.lower().endswith(".jpg")}
-    with open(paths["aws"]["output_jsonl"], 'w', encoding='utf-8') as output_1:
-        for filename in os.listdir(paths["aws"]["input_xml"]):
-            if filename.endswith(".xml"):
-                file_id = os.path.splitext(filename)[0]
-                image_file_1 = image_files_xml.get(file_id)
-                if image_file_1:
-                    file_path = os.path.join(paths["aws"]["input_xml"], filename)
-                    ocr_data = process_xml_file(file_path)
-                    image_metadata_1 = extract_metadata_from_xml(ocr_data, image_file_1)
-                    if ocr_data and image_metadata_1:
-                        jsonl_output = convert_to_jsonl(ocr_data, image_metadata_1)
-                        output_1.write(jsonl_output + '\n')
-                        xml_root = convert_to_xml(ocr_data, image_metadata_1, "HTR Team")
-                        xml_output = prettify_xml(xml_root)
-                        output_file_path = os.path.join(paths["aws"]["output_xml"], f"{file_id}.xml")
-                        with open(output_file_path, 'w', encoding='utf-8') as output_file_aws:
-                            output_file_aws.write(xml_output)
+# Process XML files for Transkribus and HTR team data
+def process_xml_data(paths):
+    for dataset, dataset_paths in paths.items():
+        if dataset == "transkribus":
+            image_files = {os.path.splitext(f)[0]: os.path.join(dataset_paths["input_images"], f)
+                           for f in os.listdir(dataset_paths["input_images"]) if f.lower().endswith(".jpg")}
+        elif dataset == "aws":
+            image_files = {os.path.splitext(f)[0]: os.path.join(dataset_paths["input_images"], f)
+                           for f in os.listdir(dataset_paths["input_images"]) if f.lower().endswith(".jpg")}
+        else:
+            continue
+        with open(dataset_paths["output_jsonl"], 'w', encoding='utf-8') as output_jsonl:
+            for filename in os.listdir(dataset_paths["input_xml_base"]):
+                if filename.endswith(".xml"):
+                    file_id = os.path.splitext(filename)[0]
+                    image_file = image_files.get(file_id)
+                    if image_file:
+                        file_path = os.path.join(dataset_paths["input_xml_base"], filename)
+                        ocr_data = process_xml_file(file_path)
+                        image_metadata = extract_metadata_from_xml(ocr_data, image_file)
+                        if ocr_data and image_metadata:
+                            jsonl_output = convert_to_jsonl(ocr_data, image_metadata)
+                            output_jsonl.write(jsonl_output + '\n')
+                            xml_root = convert_to_xml(ocr_data, image_metadata, dataset.capitalize())
+                            xml_output = prettify_xml(xml_root)
+                            output_file_path = os.path.join(dataset_paths["output_xml"], f"{file_id}.xml")
+                            with open(output_file_path, 'w', encoding='utf-8') as output_xml:
+                                output_xml.write(xml_output)
 
 # Main function to process HTML and XML files and convert them to JSONL and XML formats.
 def main():
@@ -178,37 +141,19 @@ def main():
             "input_images": f"{base_path}google_books/google_books_images_folder/",
             "output_jsonl": f"{output_base_path}google_books_data.jsonl",
             "output_xml": f"{output_base_path}google_books_data_xml/"
-        },
+        },
         "transkribus": {
-            "stok_kangyur": {
-                "input_xml_base": f"{base_path}transkrisbus/stok_kangyur/"
-            },
-            "phudrak": {
-                "input_xml_base": f"{base_path}transkrisbus/phudrak/"
-            },
-            "derge_kangyur": {
-                "input_xml_base": f"{base_path}transkrisbus/derge-kangyur/"
-            },
-            "tib_school": {
-                "input_xml_base": f"{base_path}transkrisbus/tib_school/"
-            }
-        }
+            "input_xml_base": f"{base_path}transkribus/",
+            "input_images": f"{base_path}transkribus_images_folder/",
+            "output_jsonl": f"{output_base_path}transkribus_data.jsonl",
+            "output_xml": f"{output_base_path}transkribus_data_xml/"
+        }
     }
     create_directories(paths)
     # Process Html files for Google Books data
     process_google_books_html_files(paths)
-    transkribus_datasets = {
-        "Transkribus Stok Kangyur": paths["transkribus"]["stok_kangyur"]["input_xml_base"],
-        "Transkribus Phudrak": paths["transkribus"]["phudrak"]["input_xml_base"],
-        "Transkribus Derge Kangyur": paths["transkribus"]["derge_kangyur"]["input_xml_base"],
-        "Transkribus Derge Kangyur": paths["transkribus"]["tib_school"]["input_xml_base"]
-    }
-    for dataset_name, input_xml_base in transkribus_datasets.items():
-        input_xml, output_jsonl, output_xml = get_xml_paths(input_xml_base, output_base_path)
-        process_xml_files(input_xml, output_jsonl, output_xml, dataset_name) # Process XML files for Transkribus data
-    # Process XML files for HTR team data
-    process_htr_teams_xml_files(paths)
+    # Process XML files for Transkribus and HTR team data
+    process_xml_data(paths)
 
-
 if __name__ == "__main__":
     main()
diff --git a/src/format_line_segmentations/extract_zip.py b/src/format_line_segmentations/extract_zip.py
index e16a5b7..aeb75ab 100644
--- a/src/format_line_segmentations/extract_zip.py
+++ b/src/format_line_segmentations/extract_zip.py
@@ -1,7 +1,6 @@
 import os
 import zipfile
 
-
 def extract_zip(zip_path, extract_to):
     """
     Extracts a ZIP file to a specified location.
@@ -9,12 +8,12 @@ def extract_zip(zip_path, extract_to):
     with zipfile.ZipFile(zip_path, "r") as zip_ref:
         zip_ref.extractall(extract_to)
 
-
 def find_and_extract_zip(root_path, output_path):
     """
     Recursively finds ZIP files in the given directory and subdirectories, and extracts them.
""" + count =0 for root, dirs, files in os.walk(root_path): for filename in files: if filename.endswith(".zip"): @@ -33,7 +32,8 @@ def find_and_extract_zip(root_path, output_path): find_and_extract_zip( extract_to, extract_to ) # Adjusted to use extract_to for both parameters - + count=count+1 + print(count) if __name__ == "__main__": root_path = "../../data/tib_school_zip" diff --git a/src/format_line_segmentations/formatting_google_books_file.py b/src/format_line_segmentations/formatting_google_books_file.py index 6aeac93..e296dd6 100644 --- a/src/format_line_segmentations/formatting_google_books_file.py +++ b/src/format_line_segmentations/formatting_google_books_file.py @@ -2,9 +2,9 @@ import shutil from PIL import Image -ROOT_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/extracted_data' -HTML_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/google_books_html_folder' -IMAGES_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/google_books_images_folder' +ROOT_DIR = '../../data/extracted_data' +HTML_DEST_DIR = '../../data/google_books_html_folder' +IMAGES_DEST_DIR = '../../data/google_books_images_folder' def convert_tiff_to_jpg(src_path, dest_path): with Image.open(src_path) as img: @@ -53,7 +53,6 @@ def main(): os.makedirs(HTML_DEST_DIR) if not os.path.exists(IMAGES_DEST_DIR): os.makedirs(IMAGES_DEST_DIR) - folder_b_count = 0 for folder_b in os.listdir(ROOT_DIR): folder_b_path = os.path.join(ROOT_DIR, folder_b) diff --git a/src/format_line_segmentations/formatting_htr_team_file.py b/src/format_line_segmentations/formatting_htr_team_file.py index 00c7af3..0655e52 100644 --- a/src/format_line_segmentations/formatting_htr_team_file.py +++ b/src/format_line_segmentations/formatting_htr_team_file.py @@ -2,9 +2,9 @@ import shutil from PIL import Image -ROOT_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_team_data' -XML_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_teams/htr_team_xml_folder' -IMAGES_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_teams/htr_team_images_folder' +ROOT_DIR = '../../data/htr_team_data' +XML_DEST_DIR = '../../data/htr_teams/htr_team_xml_folder' +IMAGES_DEST_DIR = '../../data/htr_teams/htr_team_images_folder' def convert_to_jpg(src_path, dest_path): with Image.open(src_path) as img: @@ -39,7 +39,6 @@ def main(): os.makedirs(XML_DEST_DIR) if not os.path.exists(IMAGES_DEST_DIR): os.makedirs(IMAGES_DEST_DIR) - folder_b_count = 0 for folder_b in os.listdir(ROOT_DIR): folder_b_path = os.path.join(ROOT_DIR, folder_b) diff --git a/src/format_line_segmentations/formatting_transkribus_file.py b/src/format_line_segmentations/formatting_transkribus_file.py new file mode 100644 index 0000000..ca4a577 --- /dev/null +++ b/src/format_line_segmentations/formatting_transkribus_file.py @@ -0,0 +1,82 @@ +import os +import shutil +from PIL import Image + +ROOT_DIR = '../../data/transkribus_extracted_data' +XML_DEST_DIR = '../../data/transkribus_xml_folder' +IMAGES_DEST_DIR = '../../data/transkribus_images_folder' + +def convert_tiff_to_jpg(src_path, dest_path): + with Image.open(src_path) as img: + img.convert('RGB').save(dest_path, 'JPEG') + +def copy_files(xml_folder, folder_c_path, dest_xml_dir, dest_images_dir, prefix=''): + xml_files = [] + image_files = [] + xml_file_names = {os.path.splitext(file)[0] for file in os.listdir(xml_folder) if file.lower().endswith('.xml')} + image_file_names = 
+
+    common_files = xml_file_names & image_file_names
+
+    for file_name in common_files:
+        xml_src = os.path.join(xml_folder, f"{file_name}.xml")
+        xml_dest = os.path.join(dest_xml_dir, f"{prefix}_{file_name}.xml")
+        shutil.copy2(xml_src, xml_dest)
+        xml_files.append(xml_dest)
+
+        image_file = next(file for file in os.listdir(folder_c_path) if os.path.splitext(file)[0] == file_name)
+        image_src = os.path.join(folder_c_path, image_file)
+        image_dest = os.path.join(dest_images_dir, f"{prefix}_{file_name}.jpg")
+
+        if image_file.lower().endswith(('.tif', '.tiff')):
+            convert_tiff_to_jpg(image_src, image_dest)
+        else:
+            shutil.copy2(image_src, image_dest)
+
+        image_files.append(image_dest)
+
+    return xml_files, image_files
+
+def process_folder_C(folder_b_name, folder_c_path, dest_xml_dir, dest_images_dir, folder_c_name):
+    xml_folder = os.path.join(folder_c_path, 'xml')
+
+    if not os.path.exists(xml_folder):
+        print(f"Skipping {folder_c_path} - Missing xml folder.")
+
+    if os.path.exists(xml_folder):
+        prefix = f"{folder_b_name}_{folder_c_name}"
+        xml_files, image_files = copy_files(xml_folder, folder_c_path, dest_xml_dir, dest_images_dir, prefix=prefix)
+        xml_count = len(xml_files)
+        image_count = len(image_files)
+        print(f"Folder {folder_c_path} - XML files: {xml_count}, Image files: {image_count}")
+        if xml_count != image_count:
+            print(f"WARNING: Folder {folder_c_path} has {xml_count} XML files and {image_count} image files.")
+    else:
+        print(f"Skipping {folder_c_path} - No xml folder found.")
+
+def process_folder_B(folder_b_path, dest_xml_dir, dest_images_dir, folder_b_name):
+    for folder_c in os.listdir(folder_b_path):
+        folder_c_path = os.path.join(folder_b_path, folder_c)
+        if os.path.isdir(folder_c_path):
+            print(f"Processing {folder_c_path}...")
+            process_folder_C(folder_b_name, folder_c_path, dest_xml_dir, dest_images_dir, folder_c)
+        else:
+            print(f"Skipping non-directory item: {folder_c_path}")
+
+def main():
+    if not os.path.exists(XML_DEST_DIR):
+        os.makedirs(XML_DEST_DIR)
+    if not os.path.exists(IMAGES_DEST_DIR):
+        os.makedirs(IMAGES_DEST_DIR)
+    folder_b_count = 0
+    for folder_b in os.listdir(ROOT_DIR):
+        folder_b_path = os.path.join(ROOT_DIR, folder_b)
+        if os.path.isdir(folder_b_path):
+            print(f"Processing {folder_b_path}...")
+            process_folder_B(folder_b_path, XML_DEST_DIR, IMAGES_DEST_DIR, folder_b)
+            folder_b_count += 1
+    print(f"Processed {folder_b_count} directories.")
+    print(f"Processed {folder_b_count} 'Folder B' directories.")
+
+if __name__ == "__main__":
+    main()