diff --git a/src/format_line_segmentations/checkpoint.txt b/src/format_line_segmentations/checkpoint.txt
deleted file mode 100644
index 37867f3..0000000
--- a/src/format_line_segmentations/checkpoint.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-W00EGS1017319.zip
-W1KG13117.zip
-W1KG13116.zip
-W1KG12960.zip
-W1KG12959.zip
-W1KG12782.zip
-W1KG12766.zip
-W1KG12765.zip
-W1KG12678.zip
-W1KG12675.zip
-W1KG12674.zip
-W1KG12670.zip
-W1KG12669.zip
-W1KG12668.zip
-W1KG12663.zip
-W1KG12662.zip
-W1KG12660.zip
-W1KG12659.zip
-W1KG12630.zip
-W1KG12589.zip
-W1KG12421.zip
-W1KG12273.zip
-W1KG11702.zip
-W1KG116071.zip
-W1KG116070.zip
-W1KG1096.zip
-W1KG109145.zip
-W1KG109100.zip
-W1KG109099.zip
-W1KG109098.zip
-W1KG109097.zip
-W1KG109095.zip
-W1KG109094.zip
-W1KG109088.zip
-W1KG10237.zip
-W1GS9.zip
-W1GS75024.zip
-W1GS66332.zip
-W1GS60383.zip
-W1GS60377.zip
-W1GS134940.zip
-W1GS108076.zip
-W1CZ931.zip
-W1AT884.zip
-W1AC56.zip
-W1AC5.zip
-W1AC466.zip
-W1AC465.zip
-W1AC464.zip
-W1AC463.zip
-W1AC462.zip
-W1AC458.zip
-W1AC457.zip
-W1AC454.zip
-W1AC453.zip
-W1AC452.zip
-W1AC451.zip
-W1AC450.zip
-W1AC449.zip
-W1AC448.zip
-W1AC447.zip
-W1AC446.zip
-W1AC443.zip
-W1AC441.zip
-W1AC439.zip
-W1AC438.zip
-W1AC437.zip
-W1AC436.zip
-W1AC435.zip
-W1AC434.zip
-W1AC433.zip
-W1AC432.zip
-W1AC431.zip
-W1AC430.zip
-W1AC429.zip
-W1AC428.zip
-W1AC427.zip
-W1AC426.zip
-W1AC425.zip
-W1AC424.zip
-W1AC423.zip
-W1AC421.zip
-W1AC420.zip
-W1AC418.zip
-W1AC417.zip
-W1AC416.zip
-W1AC415.zip
-W1AC413.zip
-W1AC412.zip
-W1AC411.zip
-W1AC407.zip
-W1AC406.zip
-W1AC405.zip
-W1AC400.zip
-W1AC4.zip
-W1AC395.zip
-W1AC394.zip
-W1AC393.zip
-W1AC392.zip
-W1AC390.zip
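Note on the checkpoint.txt deletion: the file is generated state, not source. load_checkpoints() in google_drive_download.py (later in this diff) recreates it on the next run, so deleting it simply resets download progress, and every archive listed above will be fetched again. A minimal sketch of that round trip, using the functions as they appear in this diff (the driver loop at the bottom is illustrative, not the script's actual main(); the sketch takes a plain string because save_checkpoint is called with file["name"], even though the script annotates the parameter as Path):

    from pathlib import Path

    CONVERT_CHECKPOINT = Path("checkpoint.txt")

    def load_checkpoints():
        # Return previously downloaded names; create an empty checkpoint
        # file on first run (or after it has been deleted, as here).
        if CONVERT_CHECKPOINT.exists():
            return CONVERT_CHECKPOINT.read_text().splitlines()
        CONVERT_CHECKPOINT.touch()
        return []

    def save_checkpoint(file_name):
        # Append one completed download per line.
        with open(CONVERT_CHECKPOINT, "a") as f:
            f.write(f"{file_name}\n")

    downloaded = load_checkpoints()
    for name in ["W1AC390.zip", "W1AC392.zip"]:  # names from the deleted file
        if name not in downloaded:
            # download_file(service, file_id, name, DOWNLOAD_PATH) would run here
            save_checkpoint(name)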
f"{base_path}htr_teams/htr_team_images_folder/", + "output_jsonl": f"{output_base_path}htr_team_data.jsonl", + "output_xml": f"{output_base_path}htr_teams_data_xml/" + }, + "google_books": { "input_html": f"{base_path}google_books/google_books_html_folder/", "input_images": f"{base_path}google_books/google_books_images_folder/", "output_jsonl": f"{output_base_path}google_books_data.jsonl", "output_xml": f"{output_base_path}google_books_data_xml/" }, - "aws": { - "input_xml": f"{base_path}htr_team/", - "output_jsonl": f"{output_base_path}htr_team_data.jsonl", - "output_xml": f"{output_base_path}htr_teams_data_xml/" - }, "transkribus": { "stok_kangyur": { "input_xml_base": f"{base_path}transkrisbus/stok_kangyur/" @@ -192,9 +192,10 @@ def main(): "tib_school": { "input_xml_base": f"{base_path}transkrisbus/tib_school/" } - } - } + } + } create_directories(paths) + # Process Html files for Google Books data process_google_books_html_files(paths) transkribus_datasets = { "Transkribus Stok Kangyur": paths["transkribus"]["stok_kangyur"]["input_xml_base"], @@ -204,7 +205,7 @@ def main(): } for dataset_name, input_xml_base in transkribus_datasets.items(): input_xml, output_jsonl, output_xml = get_xml_paths(input_xml_base, output_base_path) - process_xml_files(input_xml, output_jsonl, output_xml, dataset_name) + process_xml_files(input_xml, output_jsonl, output_xml, dataset_name) # Process XML files for Transkribus data # Process XML files for HTR team data process_htr_teams_xml_files(paths) diff --git a/src/format_line_segmentations/formatting_htr_team_file.py b/src/format_line_segmentations/formatting_htr_team_file.py new file mode 100644 index 0000000..00c7af3 --- /dev/null +++ b/src/format_line_segmentations/formatting_htr_team_file.py @@ -0,0 +1,53 @@ +import os +import shutil +from PIL import Image + +ROOT_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_team_data' +XML_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_teams/htr_team_xml_folder' +IMAGES_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_teams/htr_team_images_folder' + +def convert_to_jpg(src_path, dest_path): + with Image.open(src_path) as img: + img.convert('RGB').save(dest_path, 'JPEG') + +def copy_files(src_dir, dest_xml_dir, dest_images_dir, prefix=''): + xml_files = [] + image_files = [] + for root, dirs, files in os.walk(src_dir): + for file in files: + if file.lower().endswith('.xml'): + dest_path = os.path.join(dest_xml_dir, f"{prefix}_{file}") + shutil.copy2(os.path.join(root, file), dest_path) + xml_files.append(dest_path) + elif file.lower().endswith(('.tif', '.tiff', '.jpg', '.jpeg', '.png', '.gif')): + dest_path = os.path.join(dest_images_dir, f"{prefix}_{os.path.splitext(file)[0]}.jpg") + convert_to_jpg(os.path.join(root, file), dest_path) + image_files.append(dest_path) + return xml_files, image_files + +def process_folder_B(folder_b_path, dest_xml_dir, dest_images_dir): + folder_b_name = os.path.basename(folder_b_path) + xml_files, image_files = copy_files(folder_b_path, dest_xml_dir, dest_images_dir, prefix=folder_b_name) + xml_count = len(xml_files) + image_count = len(image_files) + print(f"Folder {folder_b_path} - XML files: {xml_count}, Image files: {image_count}") + if xml_count != image_count: + print(f"WARNING: Folder {folder_b_path} has {xml_count} XML files and {image_count} image files.") + +def main(): + if not os.path.exists(XML_DEST_DIR): + os.makedirs(XML_DEST_DIR) + if not os.path.exists(IMAGES_DEST_DIR): + 
diff --git a/src/format_line_segmentations/formatting_htr_team_file.py b/src/format_line_segmentations/formatting_htr_team_file.py
new file mode 100644
index 0000000..00c7af3
--- /dev/null
+++ b/src/format_line_segmentations/formatting_htr_team_file.py
@@ -0,0 +1,58 @@
+import os
+import shutil
+from PIL import Image
+
+# Source tree of raw HTR team data and the flattened destination folders.
+ROOT_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_team_data'
+XML_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_teams/htr_team_xml_folder'
+IMAGES_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_teams/htr_team_images_folder'
+
+def convert_to_jpg(src_path, dest_path):
+    # Normalize every source image format to RGB JPEG.
+    with Image.open(src_path) as img:
+        img.convert('RGB').save(dest_path, 'JPEG')
+
+def copy_files(src_dir, dest_xml_dir, dest_images_dir, prefix=''):
+    # Walk src_dir, copying XML files as-is and converting images to JPEG;
+    # names are prefixed with the source folder name to avoid collisions.
+    xml_files = []
+    image_files = []
+    for root, dirs, files in os.walk(src_dir):
+        for file in files:
+            if file.lower().endswith('.xml'):
+                dest_path = os.path.join(dest_xml_dir, f"{prefix}_{file}")
+                shutil.copy2(os.path.join(root, file), dest_path)
+                xml_files.append(dest_path)
+            elif file.lower().endswith(('.tif', '.tiff', '.jpg', '.jpeg', '.png', '.gif')):
+                dest_path = os.path.join(dest_images_dir, f"{prefix}_{os.path.splitext(file)[0]}.jpg")
+                convert_to_jpg(os.path.join(root, file), dest_path)
+                image_files.append(dest_path)
+    return xml_files, image_files
+
+def process_folder_B(folder_b_path, dest_xml_dir, dest_images_dir):
+    # Flatten one work folder and report its XML/image counts, which
+    # should match one-to-one.
+    folder_b_name = os.path.basename(folder_b_path)
+    xml_files, image_files = copy_files(folder_b_path, dest_xml_dir, dest_images_dir, prefix=folder_b_name)
+    xml_count = len(xml_files)
+    image_count = len(image_files)
+    print(f"Folder {folder_b_path} - XML files: {xml_count}, Image files: {image_count}")
+    if xml_count != image_count:
+        print(f"WARNING: Folder {folder_b_path} has {xml_count} XML files and {image_count} image files.")
+
+def main():
+    if not os.path.exists(XML_DEST_DIR):
+        os.makedirs(XML_DEST_DIR)
+    if not os.path.exists(IMAGES_DEST_DIR):
+        os.makedirs(IMAGES_DEST_DIR)
+
+    folder_b_count = 0
+    for folder_b in os.listdir(ROOT_DIR):
+        folder_b_path = os.path.join(ROOT_DIR, folder_b)
+        if os.path.isdir(folder_b_path):
+            process_folder_B(folder_b_path, XML_DEST_DIR, IMAGES_DEST_DIR)
+            folder_b_count += 1
+    print(f"Total processed 'Folder B' directories: {folder_b_count}")
+
+if __name__ == "__main__":
+    main()
diff --git a/src/format_line_segmentations/google_drive_download.py b/src/format_line_segmentations/google_drive_download.py
index bc3c467..3839260 100644
--- a/src/format_line_segmentations/google_drive_download.py
+++ b/src/format_line_segmentations/google_drive_download.py
@@ -9,8 +9,6 @@
 
 # The ID of the Google Drive folder from which to download ZIP files.
 FOLDER_ID = "15Y-PnZBT1JtrZX1ck-RT4Hd1oWU9VA7b"
-
-
 # Local directory to save the downloaded ZIP files.
 DOWNLOAD_PATH = "../../data/google_books_zip/"
 
@@ -20,11 +18,9 @@ def authenticate_google_drive():
     token_pickle = "../../data/token.pickle"
     credentials_file = "../../data/drive_cred.json"
     scopes = ["https://www.googleapis.com/auth/drive.readonly"]
-
     if os.path.exists(token_pickle):
         with open(token_pickle, "rb") as token:
             creds = pickle.load(token)
-
     # If there are no (valid) credentials available, let the user log in.
     if not creds or not creds.valid:
         if creds and creds.expired and creds.refresh_token:
@@ -32,15 +28,12 @@ def authenticate_google_drive():
         else:
             flow = InstalledAppFlow.from_client_secrets_file(credentials_file, scopes)
             creds = flow.run_local_server()
-
     # Save the credentials for the next run
     with open(token_pickle, "wb") as token:
         pickle.dump(creds, token)
-
     service = build("drive", "v3", credentials=creds)
     return service
-
 
 def list_zip_files(service, folder_id):
     """List all ZIP files in the specified Google Drive folder."""
     query = f"'{folder_id}' in parents and mimeType='application/zip'"
     results = (
@@ -51,7 +44,6 @@ def list_zip_files(service, folder_id):
     )
     return results.get("files", [])
 
-
 def download_file(service, file_id, file_name, download_path):
     """Download a file from Google Drive."""
     request = service.files().get_media(fileId=file_id)
@@ -64,17 +56,14 @@ def download_file(service, file_id, file_name, download_path):
             print(f"Downloaded {file_name} {int(status.progress() * 100)}%.")
 
 """check point system"""
-
 CONVERT_CHECKPOINT = Path("checkpoint.txt")
 
 def load_checkpoints():
     if CONVERT_CHECKPOINT.exists():
         return CONVERT_CHECKPOINT.read_text().splitlines()
-
     CONVERT_CHECKPOINT.touch()
     return []
-
 
 def save_checkpoint(file_checkpoint: Path):
     with open(CONVERT_CHECKPOINT, "a") as f:
         f.write(f"{str(file_checkpoint)}\n")
@@ -92,6 +81,5 @@ def main():
             download_file(service, file["id"], file["name"], DOWNLOAD_PATH)
             save_checkpoint(file["name"])
 
-
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/src/format_line_segmentations/xml_parsing.py b/src/format_line_segmentations/xml_parsing.py
index 96926d1..07e90da 100644
--- a/src/format_line_segmentations/xml_parsing.py
+++ b/src/format_line_segmentations/xml_parsing.py
@@ -18,7 +18,7 @@ def process_xml_file(file_path):
         print(f"Error processing {file_path}: {e}")
         return []
 
-#Extract metadata from parsed OCR data and the image file. 
+# Extract metadata from parsed OCR data and the image file.
 def extract_metadata_from_xml(ocr_data, image_file):
     metadata = {}
     metadata['id'] = os.path.splitext(os.path.basename(image_file))[0] + ".jpg"
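One last note, on formatting_htr_team_file.py above: the exists()-then-makedirs() pairs work, but os.makedirs(path, exist_ok=True) does the same in one race-free call, and Image.convert('RGB') silently discards any alpha channel, which is acceptable for scanned pages. A compact alternative sketch (a suggestion under those assumptions, not part of this patch; ensure_dirs is a hypothetical helper name):

    import os
    from PIL import Image

    def ensure_dirs(*dirs):
        # exist_ok=True collapses the exists()/makedirs() pair and cannot
        # race with a concurrent run creating the same directory.
        for d in dirs:
            os.makedirs(d, exist_ok=True)

    def convert_to_jpg(src_path, dest_path):
        # Same behavior as the script: normalize TIFF/PNG/GIF input to RGB JPEG.
        with Image.open(src_path) as img:
            img.convert('RGB').save(dest_path, 'JPEG')

    # Usage mirroring main() in the new script:
    ensure_dirs("htr_team_xml_folder", "htr_team_images_folder")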