fix: converted all htr team data into jsonl & xml format for line segmentation #10

Merged
100 changes: 0 additions & 100 deletions src/format_line_segmentations/checkpoint.txt

This file was deleted.

29 changes: 15 additions & 14 deletions src/format_line_segmentations/conversion.py
@@ -142,8 +142,8 @@ def process_google_books_html_files(paths):

 # Process XML files for HTR team data
 def process_htr_teams_xml_files(paths):
-    image_files_xml = {os.path.splitext(f)[0]: os.path.join(paths["aws"]["input_xml"], f)
-                       for f in os.listdir(paths["aws"]["input_xml"]) if f.lower().endswith(".jpg")}
+    image_files_xml = {os.path.splitext(f)[0]: os.path.join(paths["aws"]["input_images"], f)
+                       for f in os.listdir(paths["aws"]["input_images"]) if f.lower().endswith(".jpg")}
     with open(paths["aws"]["output_jsonl"], 'w', encoding='utf-8') as output_1:
         for filename in os.listdir(paths["aws"]["input_xml"]):
             if filename.endswith(".xml"):
@@ -153,11 +153,10 @@ def process_htr_teams_xml_files(paths):
                 file_path = os.path.join(paths["aws"]["input_xml"], filename)
                 ocr_data = process_xml_file(file_path)
                 image_metadata_1 = extract_metadata_from_xml(ocr_data, image_file_1)
-                if image_metadata_1:
+                if ocr_data and image_metadata_1:
                     jsonl_output = convert_to_jsonl(ocr_data, image_metadata_1)
                     output_1.write(jsonl_output + '\n')
-                    xml_root = convert_to_xml(ocr_data, image_metadata_1, "AWS Data",
-                                              "2024-06-10T11:08:30.326+00:00")
+                    xml_root = convert_to_xml(ocr_data, image_metadata_1, "HTR Team")
                     xml_output = prettify_xml(xml_root)
                     output_file_path = os.path.join(paths["aws"]["output_xml"], f"{file_id}.xml")
                     with open(output_file_path, 'w', encoding='utf-8') as output_file_aws:
@@ -168,17 +167,18 @@ def main():
     base_path = '../../data/line_segmentation_inputs/'
     output_base_path = '../../data/line_segmentation_output_format/'
     paths = {
-        "google_books": {
+        "aws": {
+            "input_xml": f"{base_path}htr_teams/htr_team_xml_folder",
+            "input_images": f"{base_path}htr_teams/htr_team_images_folder/",
+            "output_jsonl": f"{output_base_path}htr_team_data.jsonl",
+            "output_xml": f"{output_base_path}htr_teams_data_xml/"
+        },
+        "google_books": {
             "input_html": f"{base_path}google_books/google_books_html_folder/",
             "input_images": f"{base_path}google_books/google_books_images_folder/",
             "output_jsonl": f"{output_base_path}google_books_data.jsonl",
             "output_xml": f"{output_base_path}google_books_data_xml/"
         },
-        "aws": {
-            "input_xml": f"{base_path}htr_team/",
-            "output_jsonl": f"{output_base_path}htr_team_data.jsonl",
-            "output_xml": f"{output_base_path}htr_teams_data_xml/"
-        },
         "transkribus": {
             "stok_kangyur": {
                 "input_xml_base": f"{base_path}transkrisbus/stok_kangyur/"
@@ -192,9 +192,10 @@ def main():
             "tib_school": {
                 "input_xml_base": f"{base_path}transkrisbus/tib_school/"
             }
         }
     }
     create_directories(paths)
+    # Process Html files for Google Books data
     process_google_books_html_files(paths)
     transkribus_datasets = {
         "Transkribus Stok Kangyur": paths["transkribus"]["stok_kangyur"]["input_xml_base"],
@@ -204,7 +205,7 @@ def main():
     }
     for dataset_name, input_xml_base in transkribus_datasets.items():
         input_xml, output_jsonl, output_xml = get_xml_paths(input_xml_base, output_base_path)
-        process_xml_files(input_xml, output_jsonl, output_xml, dataset_name)
+        process_xml_files(input_xml, output_jsonl, output_xml, dataset_name)  # Process XML files for Transkribus data
     # Process XML files for HTR team data
     process_htr_teams_xml_files(paths)
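For orientation, each line of htr_team_data.jsonl pairs the parsed OCR line data with the image metadata. The sketch below is hypothetical — the actual keys come from convert_to_jsonl and extract_metadata_from_xml, which are outside this diff:

{"id": "sample_0001.jpg", "image": "htr_team_images_folder/sample_0001.jpg", "lines": [{"text": "...", "bbox": [120, 45, 980, 110]}]}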

53 changes: 53 additions & 0 deletions src/format_line_segmentations/formatting_htr_team_file.py
@@ -0,0 +1,53 @@
import os
import shutil
from PIL import Image

ROOT_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_team_data'
Contributor: provide only the relative path, not the absolute path.
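One way to apply that suggestion, following the relative-path convention ('../../data/…') that conversion.py already uses — the exact subfolder layout here is an assumption:

# Hypothetical relative equivalents of the absolute paths in this file
ROOT_DIR = '../../data/htr_team_data'
XML_DEST_DIR = '../../data/htr_teams/htr_team_xml_folder'
IMAGES_DEST_DIR = '../../data/htr_teams/htr_team_images_folder'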

XML_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_teams/htr_team_xml_folder'
IMAGES_DEST_DIR = '/Users/ogyenthoga/Desktop/Work/Formatting_line_segmentation/data/htr_teams/htr_team_images_folder'

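# Convert any supported source image to an RGB JPEG; the RGB conversion
# flattens alpha channels (e.g. from PNG or TIFF) that JPEG cannot store.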
def convert_to_jpg(src_path, dest_path):
with Image.open(src_path) as img:
img.convert('RGB').save(dest_path, 'JPEG')

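# Walk src_dir recursively: copy XML files into dest_xml_dir and convert images
# to JPG in dest_images_dir, prefixing each output name with the source folder
# name so same-named files from different folders cannot collide.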
def copy_files(src_dir, dest_xml_dir, dest_images_dir, prefix=''):
xml_files = []
image_files = []
for root, dirs, files in os.walk(src_dir):
for file in files:
if file.lower().endswith('.xml'):
dest_path = os.path.join(dest_xml_dir, f"{prefix}_{file}")
shutil.copy2(os.path.join(root, file), dest_path)
xml_files.append(dest_path)
elif file.lower().endswith(('.tif', '.tiff', '.jpg', '.jpeg', '.png', '.gif')):
dest_path = os.path.join(dest_images_dir, f"{prefix}_{os.path.splitext(file)[0]}.jpg")
convert_to_jpg(os.path.join(root, file), dest_path)
image_files.append(dest_path)
return xml_files, image_files

def process_folder_B(folder_b_path, dest_xml_dir, dest_images_dir):
folder_b_name = os.path.basename(folder_b_path)
xml_files, image_files = copy_files(folder_b_path, dest_xml_dir, dest_images_dir, prefix=folder_b_name)
xml_count = len(xml_files)
image_count = len(image_files)
print(f"Folder {folder_b_path} - XML files: {xml_count}, Image files: {image_count}")
if xml_count != image_count:
print(f"WARNING: Folder {folder_b_path} has {xml_count} XML files and {image_count} image files.")

def main():
if not os.path.exists(XML_DEST_DIR):
os.makedirs(XML_DEST_DIR)
if not os.path.exists(IMAGES_DEST_DIR):
os.makedirs(IMAGES_DEST_DIR)

folder_b_count = 0
for folder_b in os.listdir(ROOT_DIR):
folder_b_path = os.path.join(ROOT_DIR, folder_b)
if os.path.isdir(folder_b_path):
process_folder_B(folder_b_path, XML_DEST_DIR, IMAGES_DEST_DIR)
folder_b_count += 1
print(f"Processed {folder_b_count} Folder B directories.")
print(f"Total processed 'Folder B' directories: {folder_b_count}")

if __name__ == "__main__":
main()
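As wired up above, this script flattens the nested htr_team_data tree into the two flat folders (htr_team_xml_folder and htr_team_images_folder) that the "aws" paths in conversion.py point at, so it would run once before conversion.py.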
12 changes: 0 additions & 12 deletions src/format_line_segmentations/google_drive_download.py
@@ -9,8 +9,6 @@

 # The ID of the Google Drive folder from which to download ZIP files.
 FOLDER_ID = "15Y-PnZBT1JtrZX1ck-RT4Hd1oWU9VA7b"
-
-
 # Local directory to save the downloaded ZIP files.
 DOWNLOAD_PATH = "../../data/google_books_zip/"

@@ -20,27 +18,22 @@ def authenticate_google_drive():
     token_pickle = "../../data/token.pickle"
     credentials_file = "../../data/drive_cred.json"
     scopes = ["https://www.googleapis.com/auth/drive.readonly"]
-
     if os.path.exists(token_pickle):
         with open(token_pickle, "rb") as token:
             creds = pickle.load(token)
-
     # If there are no (valid) credentials available, let the user log in.
     if not creds or not creds.valid:
         if creds and creds.expired and creds.refresh_token:
             creds.refresh(Request())
         else:
             flow = InstalledAppFlow.from_client_secrets_file(credentials_file, scopes)
             creds = flow.run_local_server()
-
         # Save the credentials for the next run
         with open(token_pickle, "wb") as token:
             pickle.dump(creds, token)
-
     service = build("drive", "v3", credentials=creds)
     return service
-

 def list_zip_files(service, folder_id):
     """List all ZIP files in the specified Google Drive folder."""
     query = f"'{folder_id}' in parents and mimeType='application/zip'"
@@ -51,7 +44,6 @@ def list_zip_files(service, folder_id):
     )
     return results.get("files", [])
-

 def download_file(service, file_id, file_name, download_path):
     """Download a file from Google Drive."""
     request = service.files().get_media(fileId=file_id)
@@ -64,17 +56,14 @@ def download_file(service, file_id, file_name, download_path):
         print(f"Downloaded {file_name} {int(status.progress() * 100)}%.")

 """check point system"""
-
 CONVERT_CHECKPOINT = Path("checkpoint.txt")

 def load_checkpoints():
     if CONVERT_CHECKPOINT.exists():
         return CONVERT_CHECKPOINT.read_text().splitlines()
-
     CONVERT_CHECKPOINT.touch()
     return []
-

 def save_checkpoint(file_checkpoint: Path):
     with open(CONVERT_CHECKPOINT, "a") as f:
         f.write(f"{str(file_checkpoint)}\n")
@@ -92,6 +81,5 @@ def main():
         download_file(service, file["id"], file["name"], DOWNLOAD_PATH)
         save_checkpoint(file["name"])
-

 if __name__ == "__main__":
     main()
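Taken together, the checkpoint helpers make the download resumable across runs. Roughly — a sketch, since part of main() is collapsed in this view and the exact loop is an assumption:

# Hypothetical resume loop built from the helpers above
checkpoints = load_checkpoints()
for file in list_zip_files(service, FOLDER_ID):
    if file["name"] in checkpoints:
        continue  # already downloaded on a previous run
    download_file(service, file["id"], file["name"], DOWNLOAD_PATH)
    save_checkpoint(file["name"])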
2 changes: 1 addition & 1 deletion src/format_line_segmentations/xml_parsing.py
@@ -18,7 +18,7 @@ def process_xml_file(file_path):
         print(f"Error processing {file_path}: {e}")
         return []

 #Extract metadata from parsed OCR data and the image file.
 def extract_metadata_from_xml(ocr_data, image_file):
     metadata = {}
     metadata['id'] = os.path.splitext(os.path.basename(image_file))[0] + ".jpg"
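For example, with an illustrative (hypothetical) filename:

# extract_metadata_from_xml(ocr_data, ".../sample_0001.jpg")
# sets metadata['id'] to "sample_0001.jpg" — the basename with its extension normalized to .jpg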