-
Notifications
You must be signed in to change notification settings - Fork 11
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Switch off BeautifulSoup4 #3
Comments
This is a quick one that I wrote a while ago. It works well for TTML and fixes some issues in the process. It could be adapted to yours if you want ElementTree instead of Beautiful Soup. A quick test with the `time` command on 7 random TTML subtitle files took about 500 ms in total. I know Beautiful Soup can be slow with all the preprocessing it does, so this should hopefully be faster. time ./WebTTML.py *.ttml2
real 0m0.509s
user 0m0.469s
sys 0m0.047s
TTML is just XML in which the order of elements matters and elements can nest recursively. Much of the vocabulary below isn't used; it is included for completeness in case it is needed. I've also tested it on DFXP, which is essentially TTML apart from how timecodes work; this keeps the timing correct for both.
#!/bin/python3
import xml.etree.ElementTree as ET
import datetime
import re
import html
from pathlib import Path
import os
def get_styles(tags, tag_type=None):
    """Collect the recognised attributes of each direct child of *tags*.

    Args:
        tags: Parent element whose direct children are inspected, or ``None``.
        tag_type: Optional local tag name (compared case-insensitively, with
            the namespace stripped). When given, only children with that tag
            are included.

    Returns:
        A list with one attribute dict (as built by ``get_style``) per
        selected child, in document order. Empty when *tags* is ``None``.
    """
    if tags is None:
        return []
    wanted = None if tag_type is None else tag_type.lower()
    styles = []
    for child in tags:
        # ElementTree spells qualified names as "{namespace}local"; keep the
        # local part only.
        local_name = re.sub(r".*}", "", child.tag).lower()
        if wanted is None or local_name == wanted:
            # Only build the attribute dict for children that are kept.
            styles.append(get_style(child))
    return styles
# Canonical spellings of the attributes this converter recognises, looked up
# by their lower-cased local name. Built once at import time instead of on
# every call.
# https://www.w3.org/TR/ttml2/#styling-vocabulary-style
_STYLE_VOCAB = {
    name.lower(): name
    for name in (
        "id",
        "style",
        "region",
        "begin",
        "end",
        "backgroundColor",
        "backgroundExtent",
        "backgroundImage",
        "backgroundOrigin",
        "backgroundPosition",
        "backgroundRepeat",
        "border",
        "bpd",
        "color",
        "disparity",
        "display",
        "displayAlign",
        "extent",
        "fontFamily",
        "fontKerning",
        "fontSelectionStrategy",
        "fontShear",
        "fontSize",
        "fontStyle",
        "fontVariant",
        "fontWeight",
        "ipd",
        "letterSpacing",
        "lineHeight",
        "lineShear",
        "luminanceGain",
        "opacity",
        "origin",
        "overflow",
        "padding",
        "position",
        "ruby",
        "rubyAlign",
        "rubyPosition",
        "rubyReserve",
        "shear",
        "showBackground",
        "textAlign",
        "textCombine",
        "textDecoration",
        "textEmphasis",
        "textOrientation",
        "textOutline",
        "textShadow",
        "unicodeBidi",
        "visibility",
        "wrapOption",
        "writingMode",
        "zIndex",
        "gain",
        "pan",
        "pitch",
        "speak",
    )
}


def get_style(tag):
    """Return the recognised attributes of *tag* keyed by canonical name.

    Attribute names are matched case-insensitively after the ``{namespace}``
    prefix is stripped; attributes that are not in the vocabulary are ignored.

    Args:
        tag: An element exposing an ``attrib`` mapping.

    Returns:
        A new dict mapping canonical attribute names (e.g. ``"fontStyle"``)
        to the attribute values found on *tag*.
    """
    style_dict = {}
    for qualified_name, value in tag.attrib.items():
        local_name = re.sub(r".*}", "", qualified_name).lower()
        canonical = _STYLE_VOCAB.get(local_name)
        if canonical is not None:
            style_dict[canonical] = value
    return style_dict
def tag_style_attribute_converter(attribute=None, styles=None, attribute_type="style"):
    """Replace an id reference inside *attribute* with the dict it refers to.

    ``attribute[attribute_type]`` is expected to hold the ``id`` of one of the
    dicts in *styles*. The reference is replaced **in place** by the first
    dict whose ``"id"`` equals it (the very same object, not a copy), or by a
    fresh empty dict when nothing matches.

    The call is idempotent: a value that is already a dict (i.e. resolved by
    an earlier call) is left untouched rather than being reset to ``{}``.

    Args:
        attribute: Attribute dict to update. A new empty dict is used when
            omitted.
        styles: Iterable of dicts carrying an ``"id"`` key to resolve against.
        attribute_type: Key in *attribute* that holds the reference.

    Returns:
        The same *attribute* object that was passed in, after the update.
    """
    if attribute is None:
        attribute = {}
    reference = attribute.get(attribute_type)
    # Nothing to resolve: the key is missing/empty, or it was resolved before.
    if not reference or isinstance(reference, dict):
        return attribute
    resolved = {}
    for candidate in styles or ():
        if candidate.get("id") == reference:
            resolved = candidate
            break
    attribute[attribute_type] = resolved
    return attribute
def tags_style_attribute_converter(attributes=[], styles=[], attribute_type="style"):
    """Apply ``tag_style_attribute_converter`` to each dict in *attributes*.

    Returns a new list holding the converter's result for every entry, in the
    original order.
    """
    converted = []
    for entry in attributes:
        converted.append(tag_style_attribute_converter(entry, styles, attribute_type))
    return converted
def convert(filename):
    """Parse a TTML/DFXP document into a list of paragraph dicts.

    Styles and regions are read from the ``<head>`` element (its ``styling``
    and ``layout`` children) and every ``<div>`` directly under ``<body>`` is
    converted with ``div_convert``.

    Args:
        filename: Path or file object accepted by ``ElementTree.parse``.

    Returns:
        The concatenated results of ``div_convert`` for each div, in document
        order. An empty list when the document has no ``<body>``; a missing
        ``<head>`` simply means no styles or regions are available.
    """

    def local_name(element):
        # ElementTree spells qualified names as "{namespace}local".
        return re.sub(r".*}", "", element.tag).lower()

    root = ET.parse(filename).getroot()

    head_tag = body_tag = None
    for child in root:
        name = local_name(child)
        if name == "head":
            head_tag = child
        elif name == "body":
            body_tag = child

    styles, regions = [], []
    # Compare against None: an Element without children is falsy.
    if head_tag is not None:
        styling_tag = region_tag = None
        for child in head_tag:
            name = local_name(child)
            if name == "styling":
                styling_tag = child
            elif name == "layout":
                region_tag = child
        styles = get_styles(styling_tag)
        regions = tags_style_attribute_converter(get_styles(region_tag), styles)

    if body_tag is None:
        return []

    divs = []
    for child in body_tag:
        if local_name(child) == "div":
            divs.extend(div_convert(child, styles, regions))
    return divs
def div_convert(div, styles=[], regions=[]):
    """Convert the ``<p>`` elements that are direct children of *div*.

    The regions and the div's own attributes are passed through the style
    resolver first; the paragraphs are then handed to ``convert_paras``
    together with the styles, the div's attributes and the regions.
    """
    resolved_regions = tags_style_attribute_converter(regions, styles)
    div_attributes = get_style(div)
    div_style = tag_style_attribute_converter(div_attributes, styles)
    paragraphs = [
        child for child in div if re.sub(r".*}", "", child.tag).lower() == "p"
    ]
    return convert_paras(paragraphs, styles, div_style, resolved_regions)
def convert_paras(paras, styles=[], div_style={}, regions=[]):
    """Run ``convert_para`` on every paragraph and return the results as a list."""
    return [convert_para(item, styles, div_style, regions) for item in paras]
# Not read or written by anything in this section; kept so the module keeps
# exposing the same names.
i = 1


def convert_para(para, styles=None, div_style=None, regions=None):
    """Convert one ``<p>`` element into a subtitle dict.

    Args:
        para: The paragraph element.
        styles: Style dicts (each with an ``"id"``) used to resolve ``style``
            references.
        div_style: Attributes of the enclosing div. Accepted for interface
            compatibility; it is not consulted here.
            NOTE(review): presumably div-level attributes were meant to be
            inherited by the paragraph -- confirm before relying on that.
        regions: Region dicts (each with an ``"id"``) used to resolve the
            paragraph's ``region`` reference.

    Returns:
        The paragraph's attribute dict with ``style`` and ``region`` resolved
        to dicts, a ``"sub"`` list of text fragments, and ``extent``,
        ``origin``, ``displayAlign`` and ``textAlign`` always present: the
        paragraph's own value if it has one, otherwise the region's value,
        otherwise ``""``.
    """
    styles = [] if styles is None else styles
    regions = [] if regions is None else regions

    para_dict = tag_style_attribute_converter(get_style(para), styles)
    # `or {}` also covers an empty region="" attribute, which the resolver
    # leaves as an empty string.
    para_region = (
        tag_style_attribute_converter(
            para_dict, regions, attribute_type="region"
        ).get("region")
        or {}
    )
    para_dict["sub"] = para_to_text_list(para, para_dict.get("style", {}), styles)
    para_region.pop("sub", None)

    # Positional attributes fall back from the paragraph to its region.
    for key in ("extent", "origin", "displayAlign", "textAlign"):
        para_dict[key] = para_dict.get(key, para_region.get(key, ""))
    return para_dict
def para_to_text_list(para, para_style, styles):
    """Flatten an element and its descendants into styled text fragments.

    Args:
        para: Element to flatten (a paragraph, or a nested element on
            recursive calls).
        para_style: Style dict inherited from the caller; keys set by the
            element's own referenced style take precedence over it.
        styles: Style dicts (each with an ``"id"``) used to resolve ``style``
            references.

    Returns:
        A list of ``{"text": ..., "style": ...}`` dicts in reading order. A
        ``br`` element contributes a newline fragment with an empty style, and
        the tail text after each child carries this element's style.
    """
    own_style = tag_style_attribute_converter(get_style(para), styles).get("style") or {}
    # The resolved style is the shared definition taken from `styles`; merge
    # into a copy so inherited keys never leak into that definition and from
    # there into later paragraphs.
    para_style = merge_styles(dict(own_style), para_style)

    tag_name = re.sub(r".*}", "", para.tag).lower()
    para_text = []
    if para.text:
        para_text.append({"text": para.text, "style": para_style})
    if tag_name == "br":
        para_text.append({"text": "\n", "style": {}})

    for child in para:
        child_name = re.sub(r".*}", "", child.tag).lower()
        child_style = tag_style_attribute_converter(get_style(child), styles)
        # Amazon fix: a span without any recognised attribute is treated as
        # italic.
        if fix_amazon and child_name == "span" and child_style == {}:
            child_style = {"fontStyle": "italic"}
        child_style = merge_styles(child_style, para_style)
        para_text.extend(para_to_text_list(child, child_style, styles))
        if child.tail:
            para_text.append({"text": child.tail, "style": para_style})
    return para_text
def merge_styles(new, old):
    """Return *new* completed with the keys of *old* that it does not define.

    Neither argument is modified; a new dict is returned so that style
    definitions shared between several elements are never altered by a merge.

    Args:
        new: Style dict whose values take precedence.
        old: Style dict providing fallback values.

    Returns:
        A new dict with the keys of *new* first, followed by the keys that
        only *old* defines.
    """
    merged = dict(new)
    for key, value in old.items():
        merged.setdefault(key, value)
    return merged
def combine_subs(sub, use_color=False, use_font=False):
    """Render one subtitle dict as SRT text with inline markup.

    Each fragment in ``sub["sub"]`` is wrapped in ``<font>``, ``<b>`` and
    ``<i>`` tags according to its style, the fragments are joined, HTML
    entities are unescaped and spacing around the tags is normalised.

    Module-level settings read here: ``fix_amazon``, ``locations`` and, when
    it is defined, ``exclude_colors`` (a list of lower-cased colour values
    that must not be emitted).

    Args:
        sub: Subtitle dict holding a ``"sub"`` list of
            ``{"text": ..., "style": ...}`` fragments and optionally
            ``"displayAlign"``.
        use_color: Emit ``<font color=...>`` for fragments with a colour.
        use_font: Emit ``<font face=...>`` for fragments with a font family
            other than ``sansSerif`` / ``monospace``.

    Returns:
        The subtitle text. When ``locations`` is truthy and the subtitle's
        ``displayAlign`` is ``"before"`` the text is prefixed with the an8
        position tag.
    """
    # `exclude_colors` is only assigned when the file runs as a script; fall
    # back to "exclude nothing" when the module is imported.
    excluded_colors = globals().get("exclude_colors") or ()
    disallowed_fonts = ("sansserif", "monospace")

    pieces = []
    for fragment in sub.get("sub", []):
        text = fragment.get("text", "")
        style = fragment.get("style", {})

        color = style.get("color", "")
        # The exclusion list is lower-cased, so compare case-insensitively.
        if color.lower() in excluded_colors:
            color = ""
        font = style.get("fontFamily", "")

        set_font = use_font and font != "" and font.lower() not in disallowed_fonts
        set_color = use_color and color != ""
        if set_font or set_color:
            font_tag = "<font"
            if set_font:
                font_tag = f'{font_tag} face="{font}"'
            if set_color:
                font_tag = f'{font_tag} color="{color}"'
            text = f"{font_tag}>{text}</font>"
        if style.get("fontWeight", "").lower() == "bold":
            text = f"<b>{text}</b>"
        if style.get("fontStyle", "") == "italic":
            text = f"<i>{text}</i>"
        pieces.append(text)

    # Unescape HTML characters
    sub_text = html.unescape("".join(pieces))
    # Fix cases where the italic/bold is missing spaces
    if fix_amazon:
        sub_text = re.sub(r"(</?[ib]*?>)([^ ])", r"\1 \2", sub_text, flags=re.M)
    # Fix extra space around tags
    sub_text = re.sub(r" *(</[^/>]+>) *", r"\1 ", sub_text, flags=re.M)
    sub_text = re.sub(r" *(<[^/>]+>) *", r" \1", sub_text, flags=re.M)
    sub_text = re.sub(r"> <", r"><", sub_text, flags=re.M)
    # Fix the extra space at the beginning/end
    sub_text = re.sub(r"^ *(<[^/>]+>)? *", r"\1", sub_text, flags=re.M)
    # `*` rather than `+?`: the previous pattern needed two spaces to match,
    # so the single space left after a closing tag at the end of a line was
    # never removed, which in turn kept the "</i>\n<i>" merge below from
    # ever matching.
    sub_text = re.sub(r" *(</?[^/>]+?>)? *$", r"\1", sub_text, flags=re.M)
    # When an italic or bold tag is closed and reopened on the next line,
    # remove the unnecessary pair of tags.
    sub_text = re.sub(r"</([ib]+?>)\n<\1", r"\n", sub_text, flags=re.M)
    sub_text = re.sub(r"^(<[ib]>)(-+)([^- ])", r"\2 \1\3", sub_text, flags=re.M)
    # Dialogue spacing edit.
    sub_text = re.sub(r"^(-+)([^- ])", r"\1 \2", sub_text, flags=re.M)
    # Allow subs to start with any number of '>' before a tag.
    # NOTE(review): the replacement used to be built with len(r"\2"), which is
    # always 2, so any run of '>' becomes exactly ">>". That behaviour is kept
    # here; confirm whether preserving the original count was intended.
    sub_text = re.sub(r"(^[^<>]*)(>+)", r"\1>>", sub_text, flags=re.M)

    if sub.get("displayAlign", "") == "before" and locations:
        sub_text = r"{\an8}" + sub_text
    return sub_text
# Module-level settings read by the conversion functions; main() overrides
# them from the command line.
fix_amazon = True
locations = True
# Default so the module can be imported and used without running main().
exclude_colors = []


def _to_srt_time(value):
    """Convert one ``begin``/``end`` value to the form written to the SRT file.

    Values ending in ``"t"`` are tick counts at ten ticks per microsecond
    (the Netflix style) and are rendered as zero-padded ``HH:MM:SS,mmm``.
    Any other value is passed through with ``.`` replaced by ``,``.
    """
    if not value.endswith("t"):
        return value.replace(".", ",")
    # Work in whole milliseconds so every field is zero-padded and rounding
    # can never produce a seconds field of "60,000".
    total_ms = round(float(value[:-1]) / 10000)
    hours, remainder = divmod(total_ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    seconds, milliseconds = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"


def main(argv=None):
    """Command-line entry point: convert each given file to ``<name>.srt``.

    The SRT file is written next to its source file, encoded as UTF-8.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    global fix_amazon, locations, exclude_colors
    import argparse

    parser = argparse.ArgumentParser(
        description="The Timed Text Markup Language (TTML) converter."
    )
    parser.add_argument(
        "file_list",
        type=str,
        metavar="subtitles",
        nargs="+",
        help="file or directory containing the release",
    )
    parser.add_argument(
        "--color",
        dest="color",
        default=False,
        action="store_true",
        help="The color will be copied.",
    )
    parser.add_argument(
        "--exclude-colors",
        dest="exclude_colors",
        type=str,
        default='',
        help="Colors to exclude, comma delimited.",
    )
    parser.add_argument(
        "--font",
        dest="font",
        default=False,
        action="store_true",
        help="The font will be copied.",
    )
    parser.add_argument(
        "--no-amazon-fix",
        dest="no_amazon_fix",
        default=False,
        action="store_true",
        help="Disable the amazon empty span fix.",
    )
    parser.add_argument(
        "--no-locations",
        dest="no_locations",
        default=False,
        action="store_true",
        help="Don't put locations.",
    )
    args = parser.parse_args(argv)

    fix_amazon = not args.no_amazon_fix
    locations = not args.no_locations
    exclude_colors = [c.lower().strip() for c in args.exclude_colors.split(',')]

    for filename in args.file_list:
        # Build the whole document first so a conversion error does not
        # leave a truncated output file behind.
        entries = []
        for sub_number, sub in enumerate(convert(filename), start=1):
            start_time = _to_srt_time(sub.get("begin", ""))
            end_time = _to_srt_time(sub.get("end", ""))
            sub_text = combine_subs(sub, use_color=args.color, use_font=args.font)
            entries.append(f"{sub_number}\n{start_time} --> {end_time}\n{sub_text}\n\n")

        # Same directory and base name as the input, with an .srt extension.
        new_path = os.path.splitext(os.path.abspath(filename))[0] + ".srt"
        with open(new_path, "w", encoding="utf-8") as srt_file:
            srt_file.write("".join(entries))


if __name__ == "__main__":
    main()
Thanks! I'll have a look later and try to adapt it here, unless you're planning on doing that and sending a PR. |
BeautifulSoup4 is currently used for TTML subtitle conversion, and while it handles this job well, it's certainly not the fastest option.
The text was updated successfully, but these errors were encountered: