Commit

fix: converted all Google Books volumes into JSONL and XML format
jim-gyas committed Jun 26, 2024
1 parent 73ac3b9 commit aad4169
Showing 7 changed files with 323 additions and 26 deletions.
100 changes: 100 additions & 0 deletions src/format_line_segmentations/checkpoint.txt
@@ -0,0 +1,100 @@
W00EGS1017319.zip
W1KG13117.zip
W1KG13116.zip
W1KG12960.zip
W1KG12959.zip
W1KG12782.zip
W1KG12766.zip
W1KG12765.zip
W1KG12678.zip
W1KG12675.zip
W1KG12674.zip
W1KG12670.zip
W1KG12669.zip
W1KG12668.zip
W1KG12663.zip
W1KG12662.zip
W1KG12660.zip
W1KG12659.zip
W1KG12630.zip
W1KG12589.zip
W1KG12421.zip
W1KG12273.zip
W1KG11702.zip
W1KG116071.zip
W1KG116070.zip
W1KG1096.zip
W1KG109145.zip
W1KG109100.zip
W1KG109099.zip
W1KG109098.zip
W1KG109097.zip
W1KG109095.zip
W1KG109094.zip
W1KG109088.zip
W1KG10237.zip
W1GS9.zip
W1GS75024.zip
W1GS66332.zip
W1GS60383.zip
W1GS60377.zip
W1GS134940.zip
W1GS108076.zip
W1CZ931.zip
W1AT884.zip
W1AC56.zip
W1AC5.zip
W1AC466.zip
W1AC465.zip
W1AC464.zip
W1AC463.zip
W1AC462.zip
W1AC458.zip
W1AC457.zip
W1AC454.zip
W1AC453.zip
W1AC452.zip
W1AC451.zip
W1AC450.zip
W1AC449.zip
W1AC448.zip
W1AC447.zip
W1AC446.zip
W1AC443.zip
W1AC441.zip
W1AC439.zip
W1AC438.zip
W1AC437.zip
W1AC436.zip
W1AC435.zip
W1AC434.zip
W1AC433.zip
W1AC432.zip
W1AC431.zip
W1AC430.zip
W1AC429.zip
W1AC428.zip
W1AC427.zip
W1AC426.zip
W1AC425.zip
W1AC424.zip
W1AC423.zip
W1AC421.zip
W1AC420.zip
W1AC418.zip
W1AC417.zip
W1AC416.zip
W1AC415.zip
W1AC413.zip
W1AC412.zip
W1AC411.zip
W1AC407.zip
W1AC406.zip
W1AC405.zip
W1AC400.zip
W1AC4.zip
W1AC395.zip
W1AC394.zip
W1AC393.zip
W1AC392.zip
W1AC390.zip
40 changes: 16 additions & 24 deletions src/format_line_segmentations/conversion.py
@@ -24,16 +24,12 @@ def convert_to_jsonl(ocr_data, image_metadata):
    combined_output = {
        "id": image_metadata["id"],
        "image": image_metadata["image"],
-        "spans": spans,
-        "_input_hash": -548459323,
-        "_task_hash": -1621366528,
-        "_view_id": "image_manual",
-        "answer": "accept"
+        "spans": spans
    }
    return json.dumps(combined_output, ensure_ascii=False)

#Convert OCR data and image metadata to an XML format.
-def convert_to_xml(ocr_data, image_metadata, creator_name, created_time):
+def convert_to_xml(ocr_data, image_metadata, creator_name):
    root = ET.Element("PcGts", {
        "xmlns": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15",
        "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
@@ -42,13 +38,6 @@ def convert_to_xml(ocr_data, image_metadata, creator_name, created_time):
    metadata = ET.SubElement(root, "Metadata")
    creator = ET.SubElement(metadata, "Creator")
    creator.text = creator_name
-    created = ET.SubElement(metadata, "Created")
-    created.text = created_time
-    now_last_changed = datetime.now()
-    formatted_now_last_changed = now_last_changed.strftime('%Y-%m-%dT%H:%M:%S.%f%z')
-    formatted_now_last_changed = formatted_now_last_changed[:-3] + "+00:00"
-    last_changed = ET.SubElement(metadata, "LastChanged")
-    last_changed.text = formatted_now_last_changed
    page = ET.SubElement(root, "Page", {
        "imageFilename": image_metadata["id"],
    })
@@ -145,8 +134,7 @@ def process_google_books_html_files(paths):
        if ocr_data and image_metadata_0:
            jsonl_output = convert_to_jsonl(ocr_data, image_metadata_0)
            output_0.write(jsonl_output + '\n')
-            xml_root = convert_to_xml(ocr_data, image_metadata_0, "Google Books",
-                                      "2024-06-10T11:08:30.326+00:00")
+            xml_root = convert_to_xml(ocr_data, image_metadata_0, "Google Books")
            xml_output = prettify_xml(xml_root)
            output_file_path = os.path.join(paths["google_books"]["output_xml"], f"{file_id}.xml")
            with open(output_file_path, 'w', encoding='utf-8') as output_file_google_books:
@@ -180,9 +168,9 @@ def main():
    base_path = '../../data/line_segmentation_inputs/'
    output_base_path = '../../data/line_segmentation_output_format/'
    paths = {
-        "google_books": {
-            "input_html": f"{base_path}google_book_html/",
-            "input_images": f"{base_path}google_book_images/",
+        "google_books": {
+            "input_html": f"{base_path}google_books/google_books_html_folder/",
+            "input_images": f"{base_path}google_books/google_books_images_folder/",
            "output_jsonl": f"{output_base_path}google_books_data.jsonl",
            "output_xml": f"{output_base_path}google_books_data_xml/"
        },
@@ -192,30 +180,34 @@ def main():
            "output_xml": f"{output_base_path}htr_teams_data_xml/"
        },
        "transkribus": {
-            "stock_kangyur": {
-                "input_xml_base": f"{base_path}transkrisbus/stock_kangyur/"
+            "stok_kangyur": {
+                "input_xml_base": f"{base_path}transkrisbus/stok_kangyur/"
            },
            "phudrak": {
                "input_xml_base": f"{base_path}transkrisbus/phudrak/"
            },
            "derge_kangyur": {
                "input_xml_base": f"{base_path}transkrisbus/derge-kangyur/"
-            }
+            },
+            "tib_school": {
+                "input_xml_base": f"{base_path}transkrisbus/tib_school/"
+            }
        }
    }
    create_directories(paths)
    process_google_books_html_files(paths)
    transkribus_datasets = {
-        "Transkribus Stock Kangyur": paths["transkribus"]["stock_kangyur"]["input_xml_base"],
+        "Transkribus Stok Kangyur": paths["transkribus"]["stok_kangyur"]["input_xml_base"],
        "Transkribus Phudrak": paths["transkribus"]["phudrak"]["input_xml_base"],
        "Transkribus Derge Kangyur": paths["transkribus"]["derge_kangyur"]["input_xml_base"],
+        "Transkribus Tib School": paths["transkribus"]["tib_school"]["input_xml_base"]
    }
    for dataset_name, input_xml_base in transkribus_datasets.items():
        input_xml, output_jsonl, output_xml = get_xml_paths(input_xml_base, output_base_path)
        process_xml_files(input_xml, output_jsonl, output_xml, dataset_name)
    # Process XML files for HTR team data
    process_htr_teams_xml_files(paths)


if __name__ == "__main__":
    main()
41 changes: 41 additions & 0 deletions src/format_line_segmentations/extract_zip.py
@@ -0,0 +1,41 @@
import os
import zipfile


def extract_zip(zip_path, extract_to):
    """
    Extracts a ZIP file to a specified location.
    """
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_to)


def find_and_extract_zip(root_path, output_path):
    """
    Recursively finds ZIP files in the given directory and subdirectories,
    and extracts them.
    """
    for root, dirs, files in os.walk(root_path):
        for filename in files:
            if filename.endswith(".zip"):
                # Construct the ZIP's path from the directory currently being walked
                zip_path = os.path.join(root, filename)
                print(f"Extracting: {zip_path}")
                # Prepare a corresponding output directory within output_path
                relative_root = os.path.relpath(root, start=root_path)
                extract_to = os.path.join(
                    output_path, relative_root, os.path.splitext(filename)[0]
                )
                if not os.path.exists(extract_to):
                    os.makedirs(extract_to)
                extract_zip(zip_path, extract_to)
                # Recursively search the newly extracted directory for more ZIP files
                find_and_extract_zip(extract_to, extract_to)


if __name__ == "__main__":
    root_path = "../../data/tib_school_zip"
    output_path = "../../data/tib_school_extracted_data"
    find_and_extract_zip(root_path, output_path)
67 changes: 67 additions & 0 deletions src/format_line_segmentations/formatting_google_books_file.py
@@ -0,0 +1,67 @@
import os
import shutil
from PIL import Image

ROOT_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/extracted_data'
HTML_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/google_books_html_folder'
IMAGES_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/google_books_images_folder'

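# Convert a TIFF image to an RGB JPEG so all copied images share one format.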
def convert_tiff_to_jpg(src_path, dest_path):
    with Image.open(src_path) as img:
        img.convert('RGB').save(dest_path, 'JPEG')

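# Walk src_dir, copying HTML files into dest_html_dir and images into dest_images_dir;
# TIFFs are converted to JPEG, every copied file is renamed with the given prefix,
# and the destination paths are returned as two lists.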
def copy_files(src_dir, dest_html_dir, dest_images_dir, prefix=''):
    html_files = []
    image_files = []
    for root, dirs, files in os.walk(src_dir):
        for file in files:
            if file.lower().endswith('.html'):
                dest_path = os.path.join(dest_html_dir, f"{prefix}_{file}")
                shutil.copy2(os.path.join(root, file), dest_path)
                html_files.append(dest_path)
            elif file.lower().endswith(('.tif', '.tiff')):
                dest_path = os.path.join(dest_images_dir, f"{prefix}_{os.path.splitext(file)[0]}.jpg")
                convert_tiff_to_jpg(os.path.join(root, file), dest_path)
                image_files.append(dest_path)
            elif file.lower().endswith(('.jpg', '.jpeg', '.png', '.gif')):
                dest_path = os.path.join(dest_images_dir, f"{prefix}_{file}")
                shutil.copy2(os.path.join(root, file), dest_path)
                image_files.append(dest_path)

    return html_files, image_files

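# For each subdirectory that contains both an 'html' and an 'images' folder,
# copy its files into the shared destination folders (prefixed with the
# subdirectory name) and warn when the HTML and image counts differ.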
def process_folder_B(folder_b_path, dest_html_dir, dest_images_dir):
    for folder_c in os.listdir(folder_b_path):
        folder_c_path = os.path.join(folder_b_path, folder_c)
        if os.path.isdir(folder_c_path):
            html_folder = os.path.join(folder_c_path, 'html')
            images_folder = os.path.join(folder_c_path, 'images')
            if os.path.exists(html_folder) and os.path.exists(images_folder):
                html_files, image_files = copy_files(folder_c_path, dest_html_dir, dest_images_dir, prefix=os.path.basename(folder_c_path))
                html_count = len(html_files)
                image_count = len(image_files)
                print(f"Folder {folder_c_path} - HTML files: {html_count}, Image files: {image_count}")
                if html_count != image_count:
                    print(f"WARNING: Folder {folder_c_path} has {html_count} HTML files and {image_count} image files.")
            else:
                print(f"Skipping {folder_c_path} - Missing html or images folder.")
        else:
            print(f"Skipping non-directory item: {folder_c_path}")

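# Create the destination folders, then process every top-level directory under ROOT_DIR.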
def main():
    if not os.path.exists(HTML_DEST_DIR):
        os.makedirs(HTML_DEST_DIR)
    if not os.path.exists(IMAGES_DEST_DIR):
        os.makedirs(IMAGES_DEST_DIR)

    folder_b_count = 0
    for folder_b in os.listdir(ROOT_DIR):
        folder_b_path = os.path.join(ROOT_DIR, folder_b)
        if os.path.isdir(folder_b_path):
            process_folder_B(folder_b_path, HTML_DEST_DIR, IMAGES_DEST_DIR)
            folder_b_count += 1
    print(f"Processed {folder_b_count} 'foldername B' directories.")

if __name__ == "__main__":
    main()