Skip to content

Commit

Permalink
fix: converted the transkribus collection into jsonl and xml format f…
Browse files Browse the repository at this point in the history
…or line segmentation
  • Loading branch information
jim-gyas committed Jul 3, 2024
1 parent 9bfa1f1 commit 0aa2c87
Show file tree
Hide file tree
Showing 5 changed files with 127 additions and 102 deletions.
127 changes: 36 additions & 91 deletions src/format_line_segmentations/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,50 +74,6 @@ def create_directories(paths):
if not os.path.exists(output_dir):
os.makedirs(output_dir)

# Process Transkribus Data (XML) files for each directory
def get_xml_paths(base_path, output_base_path):
    """Collect Transkribus XML directories under *base_path*.

    A directory qualifies when its name contains ``xml`` or ``page``
    (case-insensitive) and it directly contains at least one ``.xml`` file.
    For each qualifying directory, derive an output JSONL path and an
    output XML directory under *output_base_path*, named after the
    directory's relative path with separators replaced by underscores.

    Returns a 3-tuple of parallel lists:
    (input_dirs, output_jsonl_paths, output_xml_dirs).
    """
    xml_dirs = []
    jsonl_paths = []
    xml_out_dirs = []
    for current, _subdirs, filenames in os.walk(base_path):
        dir_name = os.path.basename(current).lower()
        if "xml" not in dir_name and "page" not in dir_name:
            continue
        if not any(name.endswith(".xml") for name in filenames):
            continue
        xml_dirs.append(current)
        stem = os.path.relpath(current, base_path).replace(os.sep, "_")
        jsonl_paths.append(os.path.join(output_base_path, stem + ".jsonl"))
        xml_out_dirs.append(os.path.join(output_base_path, stem + "_xml"))
    return xml_dirs, jsonl_paths, xml_out_dirs

#Process XML files for Transkribus data
def process_xml_files(input_directories, output_files, output_directories, dataset_name,
                      timestamp="2024-06-10T11:08:30.326+00:00"):
    """Convert Transkribus PageXML files to JSONL and normalized XML.

    For each (input_directory, output_file, output_directory) triple, every
    ``.xml`` file in the input directory — excluding ``metadata*`` and
    ``mets.xml`` — is parsed, written as one JSONL line into *output_file*,
    and re-serialized as a prettified XML file in *output_directory*.

    Args:
        input_directories: directories containing PageXML files.
        output_files: JSONL output paths, parallel to *input_directories*.
        output_directories: XML output dirs, parallel to *input_directories*.
        dataset_name: dataset label embedded in the converted XML.
        timestamp: creation timestamp embedded in the converted XML
            (previously hard-coded; default preserves the old value).
    """
    for input_directory, output_file, output_directory in zip(input_directories, output_files, output_directories):
        os.makedirs(output_directory, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as output_f:
            # Single directory scan. The original built a dict from an
            # identical listdir/filter pass and then looked each file up in
            # it, so the lookup could never fail — pure redundant work.
            for filename in os.listdir(input_directory):
                lowered = filename.lower()
                if not lowered.endswith(".xml"):
                    continue
                if lowered.startswith("metadata") or lowered == "mets.xml":
                    continue
                file_id = os.path.splitext(filename)[0]
                file_path = os.path.join(input_directory, filename)
                ocr_data = process_xml_file(file_path)
                # NOTE(review): the second argument here is the XML path
                # itself (the original's "image_files" dict mapped xml stem
                # -> xml path) — confirm extract_metadata_from_xml expects
                # the source-file path, not an image path.
                image_metadata = extract_metadata_from_xml(ocr_data, file_path)
                if image_metadata:
                    jsonl_output = convert_to_jsonl(ocr_data, image_metadata)
                    output_f.write(jsonl_output + '\n')
                    xml_root = convert_to_xml(ocr_data, image_metadata, dataset_name, timestamp)
                    xml_output = prettify_xml(xml_root)
                    output_file_path = os.path.join(output_directory, f"{file_id}.xml")
                    with open(output_file_path, 'w', encoding='utf-8') as output_xml:
                        output_xml.write(xml_output)

# Process Google Books Data (HTML) files
def process_google_books_html_files(paths):
image_files_html = {os.path.splitext(f)[0]: os.path.join(paths["google_books"]["input_images"], f)
Expand All @@ -140,27 +96,34 @@ def process_google_books_html_files(paths):
with open(output_file_path, 'w', encoding='utf-8') as output_file_google_books:
output_file_google_books.write(xml_output)

# Process XML files for HTR team data
def process_htr_teams_xml_files(paths):
    """Convert the HTR-team PageXML files to JSONL and normalized XML.

    Pairs each ``.xml`` file in ``paths["aws"]["input_xml"]`` with a
    same-stem ``.jpg`` in ``paths["aws"]["input_images"]``; pairs with no
    matching image are skipped.  Writes one JSONL line per converted file
    to ``output_jsonl`` and a prettified XML file per input to ``output_xml``.
    """
    aws = paths["aws"]
    images_by_stem = {}
    for name in os.listdir(aws["input_images"]):
        if name.lower().endswith(".jpg"):
            images_by_stem[os.path.splitext(name)[0]] = os.path.join(aws["input_images"], name)
    with open(aws["output_jsonl"], 'w', encoding='utf-8') as jsonl_f:
        for xml_name in os.listdir(aws["input_xml"]):
            if not xml_name.endswith(".xml"):
                continue
            stem = os.path.splitext(xml_name)[0]
            image_path = images_by_stem.get(stem)
            if not image_path:
                continue
            ocr_data = process_xml_file(os.path.join(aws["input_xml"], xml_name))
            metadata = extract_metadata_from_xml(ocr_data, image_path)
            if ocr_data and metadata:
                jsonl_f.write(convert_to_jsonl(ocr_data, metadata) + '\n')
                pretty = prettify_xml(convert_to_xml(ocr_data, metadata, "HTR Team"))
                out_path = os.path.join(aws["output_xml"], f"{stem}.xml")
                with open(out_path, 'w', encoding='utf-8') as xml_f:
                    xml_f.write(pretty)
# Process XML files for Transkribus and HTR team data
def process_xml_data(paths):
    """Convert the XML-based datasets (Transkribus and HTR-team/aws) to JSONL and XML.

    For each dataset entry in *paths* whose key is ``"transkribus"`` or
    ``"aws"``, pairs every ``.xml`` file under ``input_xml_base`` with a
    same-stem ``.jpg`` under ``input_images`` (unpaired XML files are
    skipped), writes one JSONL line per pair to ``output_jsonl`` and a
    prettified XML file per pair into ``output_xml``.  Other dataset keys
    (e.g. ``"google_books"``) are ignored.
    """
    for dataset, dataset_paths in paths.items():
        # Both XML datasets were handled by two byte-identical branches in
        # the original; a single membership guard is equivalent.
        if dataset not in ("transkribus", "aws"):
            continue
        image_files = {
            os.path.splitext(f)[0]: os.path.join(dataset_paths["input_images"], f)
            for f in os.listdir(dataset_paths["input_images"])
            if f.lower().endswith(".jpg")
        }
        with open(dataset_paths["output_jsonl"], 'w', encoding='utf-8') as output_jsonl:
            for filename in os.listdir(dataset_paths["input_xml_base"]):
                if not filename.endswith(".xml"):
                    continue
                file_id = os.path.splitext(filename)[0]
                image_file = image_files.get(file_id)
                if not image_file:
                    continue
                file_path = os.path.join(dataset_paths["input_xml_base"], filename)
                ocr_data = process_xml_file(file_path)
                image_metadata = extract_metadata_from_xml(ocr_data, image_file)
                if ocr_data and image_metadata:
                    output_jsonl.write(convert_to_jsonl(ocr_data, image_metadata) + '\n')
                    # NOTE(review): capitalize() yields "Aws" for the HTR-team
                    # data, whereas the function this replaced labeled it
                    # "HTR Team" — confirm the intended dataset label.
                    xml_root = convert_to_xml(ocr_data, image_metadata, dataset.capitalize())
                    xml_output = prettify_xml(xml_root)
                    output_file_path = os.path.join(dataset_paths["output_xml"], f"{file_id}.xml")
                    with open(output_file_path, 'w', encoding='utf-8') as output_xml:
                        output_xml.write(xml_output)

# Main function to process HTML and XML files and convert them to JSONL and XML formats.
def main():
Expand All @@ -178,37 +141,19 @@ def main():
"input_images": f"{base_path}google_books/google_books_images_folder/",
"output_jsonl": f"{output_base_path}google_books_data.jsonl",
"output_xml": f"{output_base_path}google_books_data_xml/"
},
},
"transkribus": {
"stok_kangyur": {
"input_xml_base": f"{base_path}transkrisbus/stok_kangyur/"
},
"phudrak": {
"input_xml_base": f"{base_path}transkrisbus/phudrak/"
},
"derge_kangyur": {
"input_xml_base": f"{base_path}transkrisbus/derge-kangyur/"
},
"tib_school": {
"input_xml_base": f"{base_path}transkrisbus/tib_school/"
}
}
"input_xml_base": f"{base_path}transkribus/",
"input_images": f"{base_path}transkribus_images_folder/",
"output_jsonl": f"{output_base_path}transkribus_data.jsonl",
"output_xml": f"{output_base_path}transkribus_data_xml/"
}
}
create_directories(paths)
    # Process HTML files for Google Books data
process_google_books_html_files(paths)
transkribus_datasets = {
"Transkribus Stok Kangyur": paths["transkribus"]["stok_kangyur"]["input_xml_base"],
"Transkribus Phudrak": paths["transkribus"]["phudrak"]["input_xml_base"],
"Transkribus Derge Kangyur": paths["transkribus"]["derge_kangyur"]["input_xml_base"],
"Transkribus Derge Kangyur": paths["transkribus"]["tib_school"]["input_xml_base"]
}
for dataset_name, input_xml_base in transkribus_datasets.items():
input_xml, output_jsonl, output_xml = get_xml_paths(input_xml_base, output_base_path)
process_xml_files(input_xml, output_jsonl, output_xml, dataset_name) # Process XML files for Transkribus data
# Process XML files for HTR team data
process_htr_teams_xml_files(paths)
    # Process XML files for Transkribus and HTR team data
process_xml_data(paths)


if __name__ == "__main__":
main()
6 changes: 3 additions & 3 deletions src/format_line_segmentations/extract_zip.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,19 @@
import os
import zipfile


def extract_zip(zip_path, extract_to):
    """Unpack the ZIP archive at *zip_path* into the directory *extract_to*."""
    archive = zipfile.ZipFile(zip_path, "r")
    try:
        archive.extractall(extract_to)
    finally:
        archive.close()


def find_and_extract_zip(root_path, output_path):
"""
Recursively finds ZIP files in the given directory and subdirectories,
and extracts them.
"""
count =0
for root, dirs, files in os.walk(root_path):
for filename in files:
if filename.endswith(".zip"):
Expand All @@ -33,7 +32,8 @@ def find_and_extract_zip(root_path, output_path):
find_and_extract_zip(
extract_to, extract_to
) # Adjusted to use extract_to for both parameters

count=count+1
print(count)

if __name__ == "__main__":
root_path = "../../data/tib_school_zip"
Expand Down
7 changes: 3 additions & 4 deletions src/format_line_segmentations/formatting_google_books_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
import shutil
from PIL import Image

ROOT_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/extracted_data'
HTML_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/google_books_html_folder'
IMAGES_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/google_books_images_folder'
ROOT_DIR = '../../data/extracted_data'
HTML_DEST_DIR = '../../data/google_books_html_folder'
IMAGES_DEST_DIR = '../../data/google_books_images_folder'

def convert_tiff_to_jpg(src_path, dest_path):
with Image.open(src_path) as img:
Expand Down Expand Up @@ -53,7 +53,6 @@ def main():
os.makedirs(HTML_DEST_DIR)
if not os.path.exists(IMAGES_DEST_DIR):
os.makedirs(IMAGES_DEST_DIR)

folder_b_count = 0
for folder_b in os.listdir(ROOT_DIR):
folder_b_path = os.path.join(ROOT_DIR, folder_b)
Expand Down
7 changes: 3 additions & 4 deletions src/format_line_segmentations/formatting_htr_team_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
import shutil
from PIL import Image

ROOT_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_team_data'
XML_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_teams/htr_team_xml_folder'
IMAGES_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_teams/htr_team_images_folder'
ROOT_DIR = '../../data/htr_team_data'
XML_DEST_DIR = '../../data/htr_teams/htr_team_xml_folder'
IMAGES_DEST_DIR = '../../data/htr_teams/htr_team_images_folder'

def convert_to_jpg(src_path, dest_path):
with Image.open(src_path) as img:
Expand Down Expand Up @@ -39,7 +39,6 @@ def main():
os.makedirs(XML_DEST_DIR)
if not os.path.exists(IMAGES_DEST_DIR):
os.makedirs(IMAGES_DEST_DIR)

folder_b_count = 0
for folder_b in os.listdir(ROOT_DIR):
folder_b_path = os.path.join(ROOT_DIR, folder_b)
Expand Down
82 changes: 82 additions & 0 deletions src/format_line_segmentations/formatting_transkribus_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import os
import shutil
from PIL import Image

ROOT_DIR = '../../data/transkribus_extracted_data'
XML_DEST_DIR = '../../data/transkribus_xml_folder'
IMAGES_DEST_DIR = '../../data/transkribus_images_folder'

def convert_tiff_to_jpg(src_path, dest_path):
    """Re-encode the image at *src_path* as JPEG, writing it to *dest_path*."""
    with Image.open(src_path) as source:
        rgb = source.convert('RGB')  # JPEG has no alpha/palette; force RGB
        rgb.save(dest_path, 'JPEG')

def copy_files(xml_folder, folder_c_path, dest_xml_dir, dest_images_dir, prefix=''):
    """Copy XML/image pairs that share a filename stem into the destination dirs.

    Only stems present in BOTH *xml_folder* (as ``.xml``) and *folder_c_path*
    (as a recognized image extension) are copied.  XML files are copied as
    ``<prefix>_<stem>.xml``; images are written as ``<prefix>_<stem>.jpg``.

    Args:
        xml_folder: directory holding the ``.xml`` files.
        folder_c_path: directory holding the source images.
        dest_xml_dir / dest_images_dir: destination directories (must exist).
        prefix: string prepended (with ``_``) to every copied file name.

    Returns:
        (xml_dests, image_dests): parallel lists of the copied file paths.
    """
    image_exts = ('.tif', '.tiff', '.jpg', '.jpeg', '.png', '.gif')
    xml_stems = {os.path.splitext(f)[0] for f in os.listdir(xml_folder)
                 if f.lower().endswith('.xml')}
    images_by_stem = {}
    for name in os.listdir(folder_c_path):
        if name.lower().endswith(image_exts):
            # keep the first match per stem (matches original next(...) pick)
            images_by_stem.setdefault(os.path.splitext(name)[0], name)

    xml_dests = []
    image_dests = []
    # sorted() makes the copy order deterministic (set iteration was arbitrary).
    for stem in sorted(xml_stems & set(images_by_stem)):
        xml_dest = os.path.join(dest_xml_dir, f"{prefix}_{stem}.xml")
        shutil.copy2(os.path.join(xml_folder, f"{stem}.xml"), xml_dest)
        xml_dests.append(xml_dest)

        image_name = images_by_stem[stem]
        image_src = os.path.join(folder_c_path, image_name)
        image_dest = os.path.join(dest_images_dir, f"{prefix}_{stem}.jpg")
        if image_name.lower().endswith(('.jpg', '.jpeg')):
            shutil.copy2(image_src, image_dest)
        else:
            # BUG FIX: the original copied PNG/GIF bytes verbatim under a
            # .jpg name; re-encode every non-JPEG source so the extension
            # matches the actual file format.
            convert_tiff_to_jpg(image_src, image_dest)
        image_dests.append(image_dest)
    return xml_dests, image_dests

def process_folder_C(folder_b_name, folder_c_path, dest_xml_dir, dest_images_dir, folder_c_name):
    """Copy the paired XML/image files out of one third-level ("C") folder.

    Expects *folder_c_path* to contain an ``xml`` subfolder; if it does not,
    the folder is skipped with a message.  Copied files are prefixed with
    ``<folder_b_name>_<folder_c_name>`` to keep names unique across folders.
    """
    xml_folder = os.path.join(folder_c_path, 'xml')
    # BUG FIX: the original printed two different "skipping" messages for the
    # same missing-folder condition; a single guard replaces both.
    if not os.path.exists(xml_folder):
        print(f"Skipping {folder_c_path} - No xml folder found.")
        return
    prefix = f"{folder_b_name}_{folder_c_name}"
    xml_files, image_files = copy_files(xml_folder, folder_c_path, dest_xml_dir, dest_images_dir, prefix=prefix)
    xml_count = len(xml_files)
    image_count = len(image_files)
    print(f"Folder {folder_c_path} - XML files: {xml_count}, Image files: {image_count}")
    # copy_files appends to both lists in lockstep, so this warning should
    # never fire; kept as a cheap sanity check.
    if xml_count != image_count:
        print(f"WARNING: Folder {folder_c_path} has {xml_count} XML files and {image_count} image files.")

def process_folder_B(folder_b_path, dest_xml_dir, dest_images_dir, folder_b_name):
    """Process every subdirectory ("C" folder) of one second-level ("B") folder.

    Non-directory entries are reported and skipped.
    """
    for entry in os.listdir(folder_b_path):
        entry_path = os.path.join(folder_b_path, entry)
        if not os.path.isdir(entry_path):
            print(f"Skipping non-directory item: {entry_path}")
            continue
        print(f"Processing {entry_path}...")
        process_folder_C(folder_b_name, entry_path, dest_xml_dir, dest_images_dir, entry)

def main():
    """Create the destination folders, then process every top-level ("B")
    directory under ROOT_DIR, copying paired XML/image files into
    XML_DEST_DIR and IMAGES_DEST_DIR."""
    os.makedirs(XML_DEST_DIR, exist_ok=True)
    os.makedirs(IMAGES_DEST_DIR, exist_ok=True)
    folder_b_count = 0
    for folder_b in os.listdir(ROOT_DIR):
        folder_b_path = os.path.join(ROOT_DIR, folder_b)
        if os.path.isdir(folder_b_path):
            print(f"Processing {folder_b_path}...")
            process_folder_B(folder_b_path, XML_DEST_DIR, IMAGES_DEST_DIR, folder_b)
            folder_b_count += 1
    # BUG FIX: the original printed two near-identical summary lines;
    # one is enough.
    print(f"Processed {folder_b_count} 'Folder B' directories.")

if __name__ == "__main__":
main()

0 comments on commit 0aa2c87

Please sign in to comment.