diff --git a/src/format_line_segmentations/checkpoint.txt b/src/format_line_segmentations/checkpoint.txt
new file mode 100644
index 0000000..37867f3
--- /dev/null
+++ b/src/format_line_segmentations/checkpoint.txt
@@ -0,0 +1,100 @@
+W00EGS1017319.zip
+W1KG13117.zip
+W1KG13116.zip
+W1KG12960.zip
+W1KG12959.zip
+W1KG12782.zip
+W1KG12766.zip
+W1KG12765.zip
+W1KG12678.zip
+W1KG12675.zip
+W1KG12674.zip
+W1KG12670.zip
+W1KG12669.zip
+W1KG12668.zip
+W1KG12663.zip
+W1KG12662.zip
+W1KG12660.zip
+W1KG12659.zip
+W1KG12630.zip
+W1KG12589.zip
+W1KG12421.zip
+W1KG12273.zip
+W1KG11702.zip
+W1KG116071.zip
+W1KG116070.zip
+W1KG1096.zip
+W1KG109145.zip
+W1KG109100.zip
+W1KG109099.zip
+W1KG109098.zip
+W1KG109097.zip
+W1KG109095.zip
+W1KG109094.zip
+W1KG109088.zip
+W1KG10237.zip
+W1GS9.zip
+W1GS75024.zip
+W1GS66332.zip
+W1GS60383.zip
+W1GS60377.zip
+W1GS134940.zip
+W1GS108076.zip
+W1CZ931.zip
+W1AT884.zip
+W1AC56.zip
+W1AC5.zip
+W1AC466.zip
+W1AC465.zip
+W1AC464.zip
+W1AC463.zip
+W1AC462.zip
+W1AC458.zip
+W1AC457.zip
+W1AC454.zip
+W1AC453.zip
+W1AC452.zip
+W1AC451.zip
+W1AC450.zip
+W1AC449.zip
+W1AC448.zip
+W1AC447.zip
+W1AC446.zip
+W1AC443.zip
+W1AC441.zip
+W1AC439.zip
+W1AC438.zip
+W1AC437.zip
+W1AC436.zip
+W1AC435.zip
+W1AC434.zip
+W1AC433.zip
+W1AC432.zip
+W1AC431.zip
+W1AC430.zip
+W1AC429.zip
+W1AC428.zip
+W1AC427.zip
+W1AC426.zip
+W1AC425.zip
+W1AC424.zip
+W1AC423.zip
+W1AC421.zip
+W1AC420.zip
+W1AC418.zip
+W1AC417.zip
+W1AC416.zip
+W1AC415.zip
+W1AC413.zip
+W1AC412.zip
+W1AC411.zip
+W1AC407.zip
+W1AC406.zip
+W1AC405.zip
+W1AC400.zip
+W1AC4.zip
+W1AC395.zip
+W1AC394.zip
+W1AC393.zip
+W1AC392.zip
+W1AC390.zip
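checkpoint.txt seeds the resume logic added in google_drive_download.py further down: one ZIP name per line, each marking a file that has already been downloaded. A minimal sketch of how such a file is consumed, assuming this one-name-per-line format (the helper name is illustrative, not part of the patch):

```python
from pathlib import Path

def already_downloaded(file_name: str, checkpoint: str = "checkpoint.txt") -> bool:
    # checkpoint.txt holds one completed ZIP name per line;
    # membership means the download can be skipped on re-run.
    return file_name in Path(checkpoint).read_text().splitlines()
```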
diff --git a/src/format_line_segmentations/conversion.py b/src/format_line_segmentations/conversion.py
index b24ba07..ad13cd8 100644
--- a/src/format_line_segmentations/conversion.py
+++ b/src/format_line_segmentations/conversion.py
@@ -24,16 +24,12 @@ def convert_to_jsonl(ocr_data, image_metadata):
     combined_output = {
         "id": image_metadata["id"],
         "image": image_metadata["image"],
-        "spans": spans,
-        "_input_hash": -548459323,
-        "_task_hash": -1621366528,
-        "_view_id": "image_manual",
-        "answer": "accept"
+        "spans": spans
     }
     return json.dumps(combined_output, ensure_ascii=False)
 
 #Convert OCR data and image metadata to an XML format.
-def convert_to_xml(ocr_data, image_metadata, creator_name, created_time):
+def convert_to_xml(ocr_data, image_metadata, creator_name):
     root = ET.Element("PcGts", {
         "xmlns": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15",
         "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
@@ -42,13 +38,6 @@ def convert_to_xml(ocr_data, image_metadata, creator_name, created_time):
     metadata = ET.SubElement(root, "Metadata")
     creator = ET.SubElement(metadata, "Creator")
     creator.text = creator_name
-    created = ET.SubElement(metadata, "Created")
-    created.text = created_time
-    now_last_changed = datetime.now()
-    formatted_now_last_changed = now_last_changed.strftime('%Y-%m-%dT%H:%M:%S.%f%z')
-    formatted_now_last_changed = formatted_now_last_changed[:-3] + "+00:00"
-    last_changed = ET.SubElement(metadata, "LastChanged")
-    last_changed.text = formatted_now_last_changed
     page = ET.SubElement(root, "Page", {
         "imageFilename": image_metadata["id"],
     })
@@ -145,8 +134,7 @@ def process_google_books_html_files(paths):
             if ocr_data and image_metadata_0:
                 jsonl_output = convert_to_jsonl(ocr_data, image_metadata_0)
                 output_0.write(jsonl_output + '\n')
-                xml_root = convert_to_xml(ocr_data, image_metadata_0, "Google Books",
-                                          "2024-06-10T11:08:30.326+00:00")
+                xml_root = convert_to_xml(ocr_data, image_metadata_0, "Google Books")
                 xml_output = prettify_xml(xml_root)
                 output_file_path = os.path.join(paths["google_books"]["output_xml"], f"{file_id}.xml")
                 with open(output_file_path, 'w', encoding='utf-8') as output_file_google_books:
@@ -180,9 +168,9 @@ def main():
     base_path = '../../data/line_segmentation_inputs/'
     output_base_path = '../../data/line_segmentation_output_format/'
     paths = {
-        "google_books": {
-            "input_html": f"{base_path}google_book_html/",
-            "input_images": f"{base_path}google_book_images/",
+        "google_books": {
+            "input_html": f"{base_path}google_books/google_books_html_folder/",
+            "input_images": f"{base_path}google_books/google_books_images_folder/",
             "output_jsonl": f"{output_base_path}google_books_data.jsonl",
             "output_xml": f"{output_base_path}google_books_data_xml/"
         },
@@ -192,30 +180,34 @@ def main():
         "htr_teams": {
             "input_xml": f"{base_path}htr_team/",
             "output_jsonl": f"{output_base_path}htr_teams_data.jsonl",
             "output_xml": f"{output_base_path}htr_teams_data_xml/"
         },
         "transkribus": {
-            "stock_kangyur": {
-                "input_xml_base": f"{base_path}transkrisbus/stock_kangyur/"
+            "stok_kangyur": {
+                "input_xml_base": f"{base_path}transkrisbus/stok_kangyur/"
             },
             "phudrak": {
                 "input_xml_base": f"{base_path}transkrisbus/phudrak/"
             },
             "derge_kangyur": {
                 "input_xml_base": f"{base_path}transkrisbus/derge-kangyur/"
-            }
+            },
+            "tib_school": {
+                "input_xml_base": f"{base_path}transkrisbus/tib_school/"
+            }
         }
     }
     create_directories(paths)
     process_google_books_html_files(paths)
     transkribus_datasets = {
-        "Transkribus Stock Kangyur": paths["transkribus"]["stock_kangyur"]["input_xml_base"],
+        "Transkribus Stok Kangyur": paths["transkribus"]["stok_kangyur"]["input_xml_base"],
         "Transkribus Phudrak": paths["transkribus"]["phudrak"]["input_xml_base"],
-        "Transkribus Derge Kangyur": paths["transkribus"]["derge_kangyur"]["input_xml_base"]
+        "Transkribus Derge Kangyur": paths["transkribus"]["derge_kangyur"]["input_xml_base"],
+        "Transkribus Tib School": paths["transkribus"]["tib_school"]["input_xml_base"]
     }
     for dataset_name, input_xml_base in transkribus_datasets.items():
         input_xml, output_jsonl, output_xml = get_xml_paths(input_xml_base, output_base_path)
         process_xml_files(input_xml, output_jsonl, output_xml, dataset_name)
     # Process XML files for HTR team data
     process_htr_teams_xml_files(paths)
-    
+
 if __name__ == "__main__":
     main()
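With the hard-coded Prodigy bookkeeping fields (`_input_hash`, `_task_hash`, `_view_id`, `answer`) and the `Created`/`LastChanged` timestamps removed, each JSONL record reduces to the three fields the pipeline actually derives. A sketch of what one output line now looks like; the span geometry here is invented for illustration:

```python
import json

record = {
    "id": "W1AC390_0001.jpg",    # image filename, now without the resize suffix
    "image": "google_books_images_folder/W1AC390_0001.jpg",
    "spans": [{"points": [[10, 20], [500, 20], [500, 60], [10, 60]]}],
}
print(json.dumps(record, ensure_ascii=False))
```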
diff --git a/src/format_line_segmentations/extract_zip.py b/src/format_line_segmentations/extract_zip.py
new file mode 100644
index 0000000..e16a5b7
--- /dev/null
+++ b/src/format_line_segmentations/extract_zip.py
@@ -0,0 +1,41 @@
+import os
+import zipfile
+
+
+def extract_zip(zip_path, extract_to):
+    """
+    Extracts a ZIP file to a specified location.
+    """
+    with zipfile.ZipFile(zip_path, "r") as zip_ref:
+        zip_ref.extractall(extract_to)
+
+
+def find_and_extract_zip(root_path, output_path):
+    """
+    Recursively finds ZIP files in the given directory and subdirectories,
+    and extracts them.
+    """
+    for root, dirs, files in os.walk(root_path):
+        for filename in files:
+            if filename.endswith(".zip"):
+                # Construct zip_path from the current root of the walk
+                zip_path = os.path.join(root, filename)
+                print(f"Extracting: {zip_path}")
+                # Prepare a corresponding output directory within output_path
+                relative_root = os.path.relpath(root, start=root_path)
+                extract_to = os.path.join(
+                    output_path, relative_root, os.path.splitext(filename)[0]
+                )
+                if not os.path.exists(extract_to):
+                    os.makedirs(extract_to)
+                extract_zip(zip_path, extract_to)
+                # Recursively search the newly extracted directory for more ZIP files
+                find_and_extract_zip(
+                    extract_to, extract_to
+                )
+
+
+if __name__ == "__main__":
+    root_path = "../../data/tib_school_zip"
+    output_path = "../../data/tib_school_extracted_data"
+    find_and_extract_zip(root_path, output_path)
\ No newline at end of file
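Because find_and_extract_zip re-walks every directory it has just extracted, archives nested inside other archives are unpacked as well; the recursion ends once a pass finds no further .zip files. A usage sketch with a hypothetical layout:

```python
# Hypothetical input: ../../data/tib_school_zip/batch1/W1AC390.zip, which itself
# contains images.zip. The outer archive lands in
# ../../data/tib_school_extracted_data/batch1/W1AC390/, and the nested one is
# found and extracted inside that directory on the recursive pass.
find_and_extract_zip("../../data/tib_school_zip", "../../data/tib_school_extracted_data")
```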
diff --git a/src/format_line_segmentations/formatting_google_books_file.py b/src/format_line_segmentations/formatting_google_books_file.py
new file mode 100644
index 0000000..6aeac93
--- /dev/null
+++ b/src/format_line_segmentations/formatting_google_books_file.py
@@ -0,0 +1,67 @@
+import os
+import shutil
+from PIL import Image
+
+ROOT_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/extracted_data'
+HTML_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/google_books_html_folder'
+IMAGES_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/google_books_images_folder'
+
+def convert_tiff_to_jpg(src_path, dest_path):
+    with Image.open(src_path) as img:
+        img.convert('RGB').save(dest_path, 'JPEG')
+
+def copy_files(src_dir, dest_html_dir, dest_images_dir, prefix=''):
+    html_files = []
+    image_files = []
+    for root, dirs, files in os.walk(src_dir):
+        for file in files:
+            if file.lower().endswith('.html'):
+                dest_path = os.path.join(dest_html_dir, f"{prefix}_{file}")
+                shutil.copy2(os.path.join(root, file), dest_path)
+                html_files.append(dest_path)
+            elif file.lower().endswith(('.tif', '.tiff')):
+                dest_path = os.path.join(dest_images_dir, f"{prefix}_{os.path.splitext(file)[0]}.jpg")
+                convert_tiff_to_jpg(os.path.join(root, file), dest_path)
+                image_files.append(dest_path)
+            elif file.lower().endswith(('.jpg', '.jpeg', '.png', '.gif')):
+                dest_path = os.path.join(dest_images_dir, f"{prefix}_{file}")
+                shutil.copy2(os.path.join(root, file), dest_path)
+                image_files.append(dest_path)
+
+    return html_files, image_files
+
+def process_folder_B(folder_b_path, dest_html_dir, dest_images_dir):
+    for folder_c in os.listdir(folder_b_path):
+        folder_c_path = os.path.join(folder_b_path, folder_c)
+        if os.path.isdir(folder_c_path):
+            html_folder = os.path.join(folder_c_path, 'html')
+            images_folder = os.path.join(folder_c_path, 'images')
+            if os.path.exists(html_folder) and os.path.exists(images_folder):
+                html_files, image_files = copy_files(folder_c_path, dest_html_dir, dest_images_dir, prefix=os.path.basename(folder_c_path))
+                html_count = len(html_files)
+                image_count = len(image_files)
+                print(f"Folder {folder_c_path} - HTML files: {html_count}, Image files: {image_count}")
+                if html_count != image_count:
+                    print(f"WARNING: Folder {folder_c_path} has {html_count} HTML files and {image_count} image files.")
+            else:
+                print(f"Skipping {folder_c_path} - Missing html or images folder.")
+        else:
+            print(f"Skipping non-directory item: {folder_c_path}")
+
+def main():
+    if not os.path.exists(HTML_DEST_DIR):
+        os.makedirs(HTML_DEST_DIR)
+    if not os.path.exists(IMAGES_DEST_DIR):
+        os.makedirs(IMAGES_DEST_DIR)
+
+    folder_b_count = 0
+    for folder_b in os.listdir(ROOT_DIR):
+        folder_b_path = os.path.join(ROOT_DIR, folder_b)
+        if os.path.isdir(folder_b_path):
+            process_folder_B(folder_b_path, HTML_DEST_DIR, IMAGES_DEST_DIR)
+            folder_b_count += 1
+    print(f"Processed {folder_b_count} 'folder B' directories.")
+
+if __name__ == "__main__":
+    main()
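formatting_google_books_file.py assumes a two-level layout under ROOT_DIR in which each work folder carries html/ and images/ subfolders; it flattens everything into the two destination folders, prefixing each file with its work-folder name and converting TIFFs to JPEG on the way. The assumed layout, with illustrative names:

```python
# extracted_data/
#   <folder B>/
#     <folder C>/             e.g. W1AC390
#       html/   page1.html -> google_books_html_folder/W1AC390_page1.html
#       images/ page1.tif  -> google_books_images_folder/W1AC390_page1.jpg
```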
diff --git a/src/format_line_segmentations/google_drive_download.py b/src/format_line_segmentations/google_drive_download.py
new file mode 100644
index 0000000..bc3c467
--- /dev/null
+++ b/src/format_line_segmentations/google_drive_download.py
@@ -0,0 +1,97 @@
+import io
+import os
+import pickle
+from pathlib import Path
+from google.auth.transport.requests import Request  # needed to refresh expired credentials
+from google_auth_oauthlib.flow import InstalledAppFlow
+from googleapiclient.discovery import build
+from googleapiclient.http import MediaIoBaseDownload
+
+# The ID of the Google Drive folder from which to download ZIP files.
+FOLDER_ID = "15Y-PnZBT1JtrZX1ck-RT4Hd1oWU9VA7b"
+
+
+# Local directory to save the downloaded ZIP files.
+DOWNLOAD_PATH = "../../data/google_books_zip/"
+
+def authenticate_google_drive():
+    """Authenticate and return a Google Drive service instance."""
+    creds = None
+    token_pickle = "../../data/token.pickle"
+    credentials_file = "../../data/drive_cred.json"
+    scopes = ["https://www.googleapis.com/auth/drive.readonly"]
+
+    if os.path.exists(token_pickle):
+        with open(token_pickle, "rb") as token:
+            creds = pickle.load(token)
+
+    # If there are no (valid) credentials available, let the user log in.
+    if not creds or not creds.valid:
+        if creds and creds.expired and creds.refresh_token:
+            creds.refresh(Request())
+        else:
+            flow = InstalledAppFlow.from_client_secrets_file(credentials_file, scopes)
+            creds = flow.run_local_server()
+        # Save the credentials for the next run
+        with open(token_pickle, "wb") as token:
+            pickle.dump(creds, token)
+
+    service = build("drive", "v3", credentials=creds)
+    return service
+
+
+def list_zip_files(service, folder_id):
+    """List all ZIP files in the specified Google Drive folder."""
+    query = f"'{folder_id}' in parents and mimeType='application/zip'"
+    results = (
+        service.files()
+        .list(q=query, spaces="drive", fields="nextPageToken, files(id, name)")
+        .execute()
+    )
+    return results.get("files", [])
+
+
+def download_file(service, file_id, file_name, download_path):
+    """Download a file from Google Drive."""
+    request = service.files().get_media(fileId=file_id)
+    file_path = os.path.join(download_path, file_name)
+    fh = io.FileIO(file_path, "wb")
+    downloader = MediaIoBaseDownload(fh, request)
+    done = False
+    while not done:
+        status, done = downloader.next_chunk()
+        print(f"Downloaded {file_name} {int(status.progress() * 100)}%.")
+
+
+# Checkpoint system: remember which ZIP files have already been downloaded.
+CONVERT_CHECKPOINT = Path("checkpoint.txt")
+
+def load_checkpoints():
+    if CONVERT_CHECKPOINT.exists():
+        return CONVERT_CHECKPOINT.read_text().splitlines()
+
+    CONVERT_CHECKPOINT.touch()
+    return []
+
+
+def save_checkpoint(file_checkpoint: str):
+    with open(CONVERT_CHECKPOINT, "a") as f:
+        f.write(f"{file_checkpoint}\n")
+
+def main():
+    checkpoints = load_checkpoints()
+    service = authenticate_google_drive()
+    if not os.path.exists(DOWNLOAD_PATH):
+        os.makedirs(DOWNLOAD_PATH)
+    zip_files = list_zip_files(service, FOLDER_ID)
+    for file in zip_files:
+        if file["name"] in checkpoints:
+            continue
+        print(f"Downloading {file['name']}...")
+        download_file(service, file["id"], file["name"], DOWNLOAD_PATH)
+        save_checkpoint(file["name"])
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/format_line_segmentations/html_parsing.py b/src/format_line_segmentations/html_parsing.py
index 693718a..24590ce 100644
--- a/src/format_line_segmentations/html_parsing.py
+++ b/src/format_line_segmentations/html_parsing.py
@@ -21,6 +21,6 @@ def process_html_file(file_path):
 def extract_metadata_from_html(parsed_data, image_file):
     metadata = {}
     # Extract ID from the image filename
-    metadata['id'] = os.path.splitext(os.path.basename(image_file))[0] + ".jpg_2000x700.jpg"
+    metadata['id'] = os.path.splitext(os.path.basename(image_file))[0] + ".jpg"
     metadata['image'] = image_file
     return metadata
diff --git a/src/format_line_segmentations/xml_parsing.py b/src/format_line_segmentations/xml_parsing.py
index b2e923e..96926d1 100644
--- a/src/format_line_segmentations/xml_parsing.py
+++ b/src/format_line_segmentations/xml_parsing.py
@@ -21,6 +21,6 @@ def process_xml_file(file_path):
 #Extract metadata from parsed OCR data and the image file.
 def extract_metadata_from_xml(ocr_data, image_file):
     metadata = {}
-    metadata['id'] = os.path.splitext(os.path.basename(image_file))[0] + ".jpg_2000x700.jpg"
+    metadata['id'] = os.path.splitext(os.path.basename(image_file))[0] + ".jpg"
     metadata['image'] = image_file
     return metadata
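The html_parsing.py and xml_parsing.py changes are identical: record ids previously carried a resize suffix (`.jpg_2000x700.jpg`), and now they are just the image basename with a `.jpg` extension, matching the files produced by formatting_google_books_file.py. For example:

```python
import os

image_file = "google_books_images_folder/W1AC390_0001.jpg"
new_id = os.path.splitext(os.path.basename(image_file))[0] + ".jpg"
assert new_id == "W1AC390_0001.jpg"   # old code produced "W1AC390_0001.jpg_2000x700.jpg"
```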