fix: converted all htr team data into jsonl & xml format for line segmentation #10

Merged
100 changes: 0 additions & 100 deletions src/format_line_segmentations/checkpoint.txt

This file was deleted.

29 changes: 15 additions & 14 deletions src/format_line_segmentations/conversion.py
@@ -142,8 +142,8 @@ def process_google_books_html_files(paths):

 # Process XML files for HTR team data
 def process_htr_teams_xml_files(paths):
-    image_files_xml = {os.path.splitext(f)[0]: os.path.join(paths["aws"]["input_xml"], f)
-                       for f in os.listdir(paths["aws"]["input_xml"]) if f.lower().endswith(".jpg")}
+    image_files_xml = {os.path.splitext(f)[0]: os.path.join(paths["aws"]["input_images"], f)
+                       for f in os.listdir(paths["aws"]["input_images"]) if f.lower().endswith(".jpg")}
     with open(paths["aws"]["output_jsonl"], 'w', encoding='utf-8') as output_1:
         for filename in os.listdir(paths["aws"]["input_xml"]):
             if filename.endswith(".xml"):
@@ -153,11 +153,10 @@ def process_htr_teams_xml_files(paths):
                 file_path = os.path.join(paths["aws"]["input_xml"], filename)
                 ocr_data = process_xml_file(file_path)
                 image_metadata_1 = extract_metadata_from_xml(ocr_data, image_file_1)
-                if image_metadata_1:
+                if ocr_data and image_metadata_1:
                     jsonl_output = convert_to_jsonl(ocr_data, image_metadata_1)
                     output_1.write(jsonl_output + '\n')
-                    xml_root = convert_to_xml(ocr_data, image_metadata_1, "AWS Data",
-                                              "2024-06-10T11:08:30.326+00:00")
+                    xml_root = convert_to_xml(ocr_data, image_metadata_1, "HTR Team")
                     xml_output = prettify_xml(xml_root)
                     output_file_path = os.path.join(paths["aws"]["output_xml"], f"{file_id}.xml")
                     with open(output_file_path, 'w', encoding='utf-8') as output_file_aws:
@@ -168,17 +167,18 @@ def main():
     base_path = '../../data/line_segmentation_inputs/'
     output_base_path = '../../data/line_segmentation_output_format/'
     paths = {
-        "google_books": {
+        "aws": {
+            "input_xml": f"{base_path}htr_teams/htr_team_xml_folder",
+            "input_images": f"{base_path}htr_teams/htr_team_images_folder/",
+            "output_jsonl": f"{output_base_path}htr_team_data.jsonl",
+            "output_xml": f"{output_base_path}htr_teams_data_xml/"
+        },
+        "google_books": {
             "input_html": f"{base_path}google_books/google_books_html_folder/",
             "input_images": f"{base_path}google_books/google_books_images_folder/",
             "output_jsonl": f"{output_base_path}google_books_data.jsonl",
             "output_xml": f"{output_base_path}google_books_data_xml/"
         },
-        "aws": {
-            "input_xml": f"{base_path}htr_team/",
-            "output_jsonl": f"{output_base_path}htr_team_data.jsonl",
-            "output_xml": f"{output_base_path}htr_teams_data_xml/"
-        },
         "transkribus": {
             "stok_kangyur": {
                 "input_xml_base": f"{base_path}transkrisbus/stok_kangyur/"
@@ -192,9 +192,10 @@ def main():
             "tib_school": {
                 "input_xml_base": f"{base_path}transkrisbus/tib_school/"
             }
         }
     }
     create_directories(paths)
+    # Process Html files for Google Books data
     process_google_books_html_files(paths)
     transkribus_datasets = {
         "Transkribus Stok Kangyur": paths["transkribus"]["stok_kangyur"]["input_xml_base"],
@@ -204,7 +205,7 @@ def main():
     }
     for dataset_name, input_xml_base in transkribus_datasets.items():
         input_xml, output_jsonl, output_xml = get_xml_paths(input_xml_base, output_base_path)
-        process_xml_files(input_xml, output_jsonl, output_xml, dataset_name)
+        process_xml_files(input_xml, output_jsonl, output_xml, dataset_name)  # Process XML files for Transkribus data
     # Process XML files for HTR team data
     process_htr_teams_xml_files(paths)
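For orientation, each line of htr_team_data.jsonl pairs the parsed OCR line data with the image metadata. The sketch below is hypothetical — the actual keys come from convert_to_jsonl and extract_metadata_from_xml, which are outside this diff:

{"id": "sample_0001.jpg", "image": "htr_team_images_folder/sample_0001.jpg", "lines": [{"text": "...", "bbox": [120, 45, 980, 110]}]}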

53 changes: 53 additions & 0 deletions src/format_line_segmentations/formatting_htr_team_file.py
@@ -0,0 +1,53 @@
import os
import shutil
from PIL import Image

ROOT_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_team_data'
Contributor: provide only the relative path, not the absolute path.
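One way to apply that suggestion, following the relative-path convention ('../../data/…') that conversion.py already uses — the exact subfolder layout here is an assumption:

# Hypothetical relative equivalents of the absolute paths in this file
ROOT_DIR = '../../data/htr_team_data'
XML_DEST_DIR = '../../data/htr_teams/htr_team_xml_folder'
IMAGES_DEST_DIR = '../../data/htr_teams/htr_team_images_folder'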

XML_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_teams/htr_team_xml_folder'
IMAGES_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_teams/htr_team_images_folder'

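# Convert any supported source image to an RGB JPEG; the RGB conversion
# flattens alpha channels (e.g. from PNG or TIFF) that JPEG cannot store.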
def convert_to_jpg(src_path, dest_path):
with Image.open(src_path) as img:
img.convert('RGB').save(dest_path, 'JPEG')

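# Walk src_dir recursively: copy XML files into dest_xml_dir and convert images
# to JPG in dest_images_dir, prefixing each output name with the source folder
# name so same-named files from different folders cannot collide.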
def copy_files(src_dir, dest_xml_dir, dest_images_dir, prefix=''):
xml_files = []
image_files = []
for root, dirs, files in os.walk(src_dir):
for file in files:
if file.lower().endswith('.xml'):
dest_path = os.path.join(dest_xml_dir, f"{prefix}_{file}")
shutil.copy2(os.path.join(root, file), dest_path)
xml_files.append(dest_path)
elif file.lower().endswith(('.tif', '.tiff', '.jpg', '.jpeg', '.png', '.gif')):
dest_path = os.path.join(dest_images_dir, f"{prefix}_{os.path.splitext(file)[0]}.jpg")
convert_to_jpg(os.path.join(root, file), dest_path)
image_files.append(dest_path)
return xml_files, image_files

def process_folder_B(folder_b_path, dest_xml_dir, dest_images_dir):
folder_b_name = os.path.basename(folder_b_path)
xml_files, image_files = copy_files(folder_b_path, dest_xml_dir, dest_images_dir, prefix=folder_b_name)
xml_count = len(xml_files)
image_count = len(image_files)
print(f"Folder {folder_b_path} - XML files: {xml_count}, Image files: {image_count}")
if xml_count != image_count:
print(f"WARNING: Folder {folder_b_path} has {xml_count} XML files and {image_count} image files.")

def main():
if not os.path.exists(XML_DEST_DIR):
os.makedirs(XML_DEST_DIR)
if not os.path.exists(IMAGES_DEST_DIR):
os.makedirs(IMAGES_DEST_DIR)

folder_b_count = 0
for folder_b in os.listdir(ROOT_DIR):
folder_b_path = os.path.join(ROOT_DIR, folder_b)
if os.path.isdir(folder_b_path):
process_folder_B(folder_b_path, XML_DEST_DIR, IMAGES_DEST_DIR)
folder_b_count += 1
print(f"Processed {folder_b_count} Folder B directories.")
print(f"Total processed 'Folder B' directories: {folder_b_count}")

if __name__ == "__main__":
main()
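As wired up above, this script flattens the nested htr_team_data tree into the two flat folders (htr_team_xml_folder and htr_team_images_folder) that the "aws" paths in conversion.py point at, so it would run once before conversion.py.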
12 changes: 0 additions & 12 deletions src/format_line_segmentations/google_drive_download.py
@@ -9,8 +9,6 @@

 # The ID of the Google Drive folder from which to download ZIP files.
 FOLDER_ID = "15Y-PnZBT1JtrZX1ck-RT4Hd1oWU9VA7b"
-
-
 # Local directory to save the downloaded ZIP files.
 DOWNLOAD_PATH = "../../data/google_books_zip/"

@@ -20,27 +18,22 @@ def authenticate_google_drive():
     token_pickle = "../../data/token.pickle"
     credentials_file = "../../data/drive_cred.json"
     scopes = ["https://www.googleapis.com/auth/drive.readonly"]
-
     if os.path.exists(token_pickle):
         with open(token_pickle, "rb") as token:
             creds = pickle.load(token)
-
     # If there are no (valid) credentials available, let the user log in.
     if not creds or not creds.valid:
         if creds and creds.expired and creds.refresh_token:
             creds.refresh(Request())
         else:
             flow = InstalledAppFlow.from_client_secrets_file(credentials_file, scopes)
             creds = flow.run_local_server()
-
         # Save the credentials for the next run
         with open(token_pickle, "wb") as token:
             pickle.dump(creds, token)
-
     service = build("drive", "v3", credentials=creds)
     return service
-

 def list_zip_files(service, folder_id):
     """List all ZIP files in the specified Google Drive folder."""
     query = f"'{folder_id}' in parents and mimeType='application/zip'"
@@ -51,7 +44,6 @@ def list_zip_files(service, folder_id):
     )
     return results.get("files", [])
-

 def download_file(service, file_id, file_name, download_path):
     """Download a file from Google Drive."""
     request = service.files().get_media(fileId=file_id)
@@ -64,17 +56,14 @@ def download_file(service, file_id, file_name, download_path):
         print(f"Downloaded {file_name} {int(status.progress() * 100)}%.")

 """check point system"""
-
 CONVERT_CHECKPOINT = Path("checkpoint.txt")

 def load_checkpoints():
     if CONVERT_CHECKPOINT.exists():
         return CONVERT_CHECKPOINT.read_text().splitlines()
-
     CONVERT_CHECKPOINT.touch()
     return []
-

 def save_checkpoint(file_checkpoint: Path):
     with open(CONVERT_CHECKPOINT, "a") as f:
         f.write(f"{str(file_checkpoint)}\n")
@@ -92,6 +81,5 @@ def main():
         download_file(service, file["id"], file["name"], DOWNLOAD_PATH)
         save_checkpoint(file["name"])
-

 if __name__ == "__main__":
     main()
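Taken together, the checkpoint helpers make the download resumable across runs. Roughly — a sketch, since part of main() is collapsed in this view and the exact loop is an assumption:

# Hypothetical resume loop built from the helpers above
checkpoints = load_checkpoints()
for file in list_zip_files(service, FOLDER_ID):
    if file["name"] in checkpoints:
        continue  # already downloaded on a previous run
    download_file(service, file["id"], file["name"], DOWNLOAD_PATH)
    save_checkpoint(file["name"])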
2 changes: 1 addition & 1 deletion src/format_line_segmentations/xml_parsing.py
@@ -18,7 +18,7 @@ def process_xml_file(file_path):
         print(f"Error processing {file_path}: {e}")
         return []

 #Extract metadata from parsed OCR data and the image file.
 def extract_metadata_from_xml(ocr_data, image_file):
     metadata = {}
     metadata['id'] = os.path.splitext(os.path.basename(image_file))[0] + ".jpg"
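For example, with an illustrative (hypothetical) filename:

# extract_metadata_from_xml(ocr_data, ".../sample_0001.jpg")
# sets metadata['id'] to "sample_0001.jpg" — the basename with its extension normalized to .jpg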