Switch off BeautifulSoup4 #3

vevv · 2023-02-13T23:48:04Z

BeautifulSoup4 is currently used for TTML subtitle conversion, and while it handles this job well, it's certainly not the fastest option.

MiM-MiM · 2023-03-04T08:28:34Z

This is a quick one that I wrote a while ago; it works well for TTML and fixes some issues in the process. It could be adapted to your project if you want ElementTree instead of BeautifulSoup. A quick test with the time command on 7 random TTML subtitles took about 500 ms. I know BeautifulSoup can be slow because of all the preprocessing it does, so this should hopefully be faster.

time ./WebTTML.py *.ttml2
real    0m0.509s
user    0m0.469s
sys     0m0.047s

TTML is just XML in which the order of elements matters, so it can be processed recursively. A lot of the vocabulary isn't used; it is included only for completeness in case it is needed. I've also tested it on DFXP, which is essentially TTML apart from how timecodes work, and this keeps the timing correct for both.

#!/bin/python3
import xml.etree.ElementTree as ET
import datetime
import re
import html
from pathlib import Path
import os



def get_styles(tags, tag_type=None):
    """Collect the recognised attributes of every child of *tags*.

    Args:
        tags: Parent element (or any iterable of elements) whose children
            are inspected; ``None`` yields an empty list.
        tag_type: Optional local tag name.  When given, only children whose
            namespace-stripped, lower-cased tag equals it (compared
            case-insensitively) are kept.

    Returns:
        A list with one attribute dict (as built by ``get_style``) per
        retained child, in document order.
    """
    if tags is None:
        return []
    wanted = None if tag_type is None else tag_type.lower()
    collected = []
    for child in tags:
        # Drop the "{namespace}" prefix ElementTree puts in front of tag names.
        local_name = re.sub(r".*}", "", child.tag).lower()
        if wanted is not None and local_name != wanted:
            continue
        collected.append(get_style(child))
    return collected


def get_style(tag):
    """Return the recognised TTML attributes of *tag* as a plain dict.

    Attribute names are matched case-insensitively after their
    ``{namespace}`` prefix is removed, and are stored under the canonical
    camelCase spelling listed below.  Attributes not in the list are ignored.

    Args:
        tag: An ElementTree element.

    Returns:
        Dict mapping canonical attribute name to the attribute's string value.
    """
    # https://www.w3.org/TR/ttml2/#styling-vocabulary-style
    canonical_names = (
        "id",
        "style",
        "region",
        "begin",
        "end",
        "backgroundColor",
        "backgroundExtent",
        "backgroundImage",
        "backgroundOrigin",
        "backgroundPosition",
        "backgroundRepeat",
        "border",
        "bpd",
        "color",
        "disparity",
        "display",
        "displayAlign",
        "extent",
        "fontFamily",
        "fontKerning",
        "fontSelectionStrategy",
        "fontShear",
        "fontSize",
        "fontStyle",
        "fontVariant",
        "fontWeight",
        "ipd",
        "letterSpacing",
        "lineHeight",
        "lineShear",
        "luminanceGain",
        "opacity",
        "origin",
        "overflow",
        "padding",
        "position",
        "ruby",
        "rubyAlign",
        "rubyPosition",
        "rubyReserve",
        "shear",
        "showBackground",
        "textAlign",
        "textCombine",
        "textDecoration",
        "textEmphasis",
        "textOrientation",
        "textOutline",
        "textShadow",
        "unicodeBidi",
        "visibility",
        "wrapOption",
        "writingMode",
        "zIndex",
        "gain",
        "pan",
        "pitch",
        "speak",
    )
    # Lookup table: lower-cased name -> canonical camelCase name.
    vocab = {name.lower(): name for name in canonical_names}

    found = {}
    for raw_name, value in tag.attrib.items():
        canonical = vocab.get(re.sub(r".*}", "", raw_name).lower())
        if canonical is not None:
            found[canonical] = value
    return found


def tag_style_attribute_converter(attribute=None, styles=None, attribute_type="style"):
    """Replace an id reference in *attribute* with the dict it refers to.

    ``attribute[attribute_type]`` is expected to hold the ``id`` of one of
    the dicts in *styles*.  It is overwritten, in place, with the first dict
    whose ``"id"`` equals it, or with ``{}`` when nothing matches.

    Args:
        attribute: Attribute dict to update in place.  Omitting it (or
            passing ``None``) uses a fresh empty dict for each call.
        styles: Candidate dicts to search by ``"id"``; ``None`` means none.
        attribute_type: Key holding the reference (``"style"`` or
            ``"region"`` in this file).

    Returns:
        The same *attribute* object that was passed in.
    """
    # A fresh dict per call: a shared ``{}`` default would be handed back to
    # the caller and any mutation would leak into later calls.
    if attribute is None:
        attribute = {}
    reference = attribute.get(attribute_type)
    if not reference:
        return attribute
    # Already resolved by an earlier pass (regions are converted more than
    # once); leave it alone instead of clobbering it with an empty dict.
    if isinstance(reference, dict):
        return attribute
    attribute[attribute_type] = next(
        (style for style in styles or () if style.get("id") == reference),
        {},
    )
    return attribute


def tags_style_attribute_converter(attributes=[], styles=[], attribute_type="style"):
    """Apply ``tag_style_attribute_converter`` to each dict in *attributes*.

    Args:
        attributes: Attribute dicts to resolve.
        styles: Candidate dicts searched by ``"id"``.
        attribute_type: Key whose id reference is replaced.

    Returns:
        A new list holding the result for each entry, in input order.
    """
    resolved = []
    for entry in attributes:
        resolved.append(tag_style_attribute_converter(entry, styles, attribute_type))
    return resolved


def convert(filename):
    """Parse a TTML/DFXP document and return its paragraphs as dicts.

    Args:
        filename: Path or file object accepted by ``ET.parse``.

    Returns:
        A list with one dict per ``<p>`` found directly inside the ``<div>``
        children of ``<body>``, in document order.  A document without a
        ``<body>`` yields an empty list, and a missing ``<head>`` simply
        means no styles or regions are available.
    """
    root = ET.parse(filename).getroot()

    # Both sections are looked up explicitly so that a document lacking
    # either one no longer fails on an unbound local variable.
    head_tag, body_tag = None, None
    for child in root:
        tag_name = re.sub(r".*}", "", child.tag).lower()
        if tag_name == "head":
            head_tag = child
        elif tag_name == "body":
            body_tag = child

    styles, regions = [], []
    if head_tag is not None:
        styling_tag, region_tag = None, None
        for child in head_tag:
            lowered = child.tag.lower()
            if lowered.endswith("styling"):
                styling_tag = child
            elif lowered.endswith("layout"):
                region_tag = child
        styles = get_styles(styling_tag)
        regions = tags_style_attribute_converter(get_styles(region_tag), styles)

    if body_tag is None:
        return []

    divs = []
    for child in body_tag:
        if re.sub(r".*}", "", child.tag).lower() == "div":
            divs.extend(div_convert(child, styles, regions))
    return divs


def div_convert(div, styles=[], regions=[]):
    """Convert the ``<p>`` children of one ``<div>`` element.

    Args:
        div: The ``<div>`` element.
        styles: Style dicts from the document head.
        regions: Region dicts from the document head.

    Returns:
        The list produced by ``convert_paras`` for the div's direct ``<p>``
        children.
    """
    regions = tags_style_attribute_converter(regions, styles)
    div_style = tag_style_attribute_converter(get_style(div), styles)
    paragraphs = [
        element
        for element in div
        if re.sub(r".*}", "", element.tag).lower() == "p"
    ]
    return convert_paras(paragraphs, styles, div_style, regions)


def convert_paras(paras, styles=[], div_style={}, regions=[]):
    """Convert each paragraph element with ``convert_para``.

    Args:
        paras: Iterable of ``<p>`` elements.
        styles: Style dicts from the document head.
        div_style: Attribute dict of the enclosing ``<div>``.
        regions: Region dicts from the document head.

    Returns:
        A list of paragraph dicts, one per element, in input order.
    """
    return [convert_para(para, styles, div_style, regions) for para in paras]


# Module-level counter.  convert_para neither reads nor updates it.
# NOTE(review): appears unused in the visible code; confirm before removing.
i = 1


def convert_para(para, styles=[], div_style={}, regions=[]):
    """Build the dict describing a single ``<p>`` element.

    The result holds the paragraph's recognised attributes, with ``style``
    and ``region`` references replaced by the dicts they point to, plus:

    * ``"sub"``: the list of text fragments from ``para_to_text_list``;
    * ``"extent"``, ``"origin"``, ``"displayAlign"``, ``"textAlign"``: the
      paragraph's own value when present, otherwise the region's value,
      otherwise ``""``.

    Args:
        para: The ``<p>`` element.
        styles: Style dicts from the document head.
        div_style: Attribute dict of the enclosing ``<div>`` (not used here).
        regions: Region dicts from the document head.

    Returns:
        The paragraph dict.
    """
    own_attrs = get_style(para)
    para_dict = tag_style_attribute_converter(own_attrs, styles)
    region = tag_style_attribute_converter(
        para_dict, regions, attribute_type="region"
    ).get("region", {})
    para_dict["sub"] = para_to_text_list(para, own_attrs.get("style", {}), styles)
    region.pop("sub", None)
    # Layout attributes fall back from the paragraph to its region.
    for key in ("extent", "origin", "displayAlign", "textAlign"):
        para_dict[key] = para_dict.get(key, region.get(key, ""))
    return para_dict


def para_to_text_list(para, para_style, styles):
    """Flatten an element and its descendants into styled text fragments.

    Args:
        para: Element to flatten (a ``<p>``, ``<span>``, ``<br>``, ...).
        para_style: Style dict inherited from the enclosing element.
        styles: Style dicts from the document head.

    Returns:
        A list of ``{"text": str, "style": dict}`` fragments in document
        order.  A ``<br>`` element contributes a ``"\\n"`` fragment with an
        empty style.
    """
    own_style = tag_style_attribute_converter(get_style(para), styles).get(
        "style", {}
    )
    para_style = merge_styles(own_style, para_style)

    fragments = []
    if para.text:
        fragments.append({"text": para.text, "style": para_style})
    if re.sub(r".*}", "", para.tag).lower() == "br":
        fragments.append({"text": "\n", "style": {}})

    for child in para:
        child_name = re.sub(r".*}", "", child.tag).lower()
        child_style = tag_style_attribute_converter(get_style(child), styles)
        # With the Amazon fix enabled, a <span> carrying no recognised
        # attributes is treated as italic.
        if fix_amazon and child_name == "span" and child_style == {}:
            child_style = {"fontStyle": "italic"}
        child_style = merge_styles(child_style, para_style)
        fragments.extend(para_to_text_list(child, child_style, styles))
        # Text following the child belongs to this element, not the child.
        if child.tail:
            fragments.append({"text": child.tail, "style": para_style})
    return fragments


def merge_styles(new, old):
    """Return *new* completed with the entries of *old* it does not define.

    Keys present in *new* win; keys found only in *old* are appended after
    them.  Neither argument is modified: *new* is frequently a dict taken
    straight from the document's shared style table, and updating it in
    place would leak inherited properties into every later element that
    references the same style.

    Args:
        new: Style dict with the higher priority.
        old: Style dict supplying fallback values.

    Returns:
        A new dict containing the merged entries.
    """
    merged = dict(new)
    for key, value in old.items():
        merged.setdefault(key, value)
    return merged


def combine_subs(sub, use_color=False, use_font=False):
    """Render one paragraph dict as SRT cue text.

    Reads the module-level flags ``fix_amazon`` and ``locations`` and, when
    it is defined, the module-level ``exclude_colors`` list.

    Args:
        sub: Paragraph dict; its ``"sub"`` entry is a list of
            ``{"text": ..., "style": {...}}`` fragments and its
            ``"displayAlign"`` entry controls the ``{\\an8}`` prefix.
        use_color: Emit ``<font color=...>`` for fragments with a colour
            that is not excluded.
        use_font: Emit ``<font face=...>`` for fragments whose font family
            is not ``sansSerif``/``monospace``.

    Returns:
        The cue text with ``<i>``/``<b>``/``<font>`` markup applied and
        spacing around tags normalised.
    """
    # ``exclude_colors`` is only assigned when the file runs as a script, so
    # fall back to "nothing excluded" instead of raising NameError.  The
    # list is lower-cased, so compare the colour lower-cased as well.
    excluded = {c.lower().strip() for c in globals().get("exclude_colors", ())}
    display_align = sub.get("displayAlign", "")
    disallowed_fonts = ("sansserif", "monospace")

    pieces = []
    for fragment in sub.get("sub", []):
        text = fragment.get("text", "")
        style = fragment.get("style", {})
        color = style.get("color", "")
        if color.lower().strip() in excluded:
            color = ""
        font = style.get("fontFamily", "")
        set_font = use_font and font != "" and font.lower() not in disallowed_fonts
        set_color = use_color and color != ""
        if set_font or set_color:
            opening = "<font"
            if set_font:
                opening = f'{opening} face="{font}"'
            if set_color:
                opening = f'{opening} color="{color}"'
            text = f"{opening}>{text}</font>"
        if style.get("fontWeight", "").lower() == "bold":
            text = f"<b>{text}</b>"
        if style.get("fontStyle", "") == "italic":
            text = f"<i>{text}</i>"
        pieces.append(text)

    # Unescape HTML characters
    sub_text = html.unescape("".join(pieces))
    # Fix cases where the italic/bold is missing spaces
    if fix_amazon:
        sub_text = re.sub(r"(</?[ib]*?>)([^ ])", r"\1 \2", sub_text, flags=re.M)
    # Fix extra space around tags
    sub_text = re.sub(r" *(</[^/>]+>) *", r"\1 ", sub_text, flags=re.M)
    sub_text = re.sub(r" *(<[^/>]+>) *", r" \1", sub_text, flags=re.M)
    sub_text = re.sub(r"> <", r"><", sub_text, flags=re.M)
    # Fix the extra space at the beginning/end of each line.  The trailing
    # pattern uses ``*`` so that the single space inserted after a closing
    # tag above is removed too (``+?`` on both sides needed two spaces).
    sub_text = re.sub(r"^ *(<[^/>]+>)? *", r"\1", sub_text, flags=re.M)
    sub_text = re.sub(r" *(</?[^/>]+?>)? *$", r"\1", sub_text, flags=re.M)
    # When an italic or bold tag is closed and reopened on the next line,
    # remove the unnecessary tags
    sub_text = re.sub(r"</([ib]+?>)\n<\1", r"\n", sub_text, flags=re.M)
    sub_text = re.sub(r"^(<[ib]>)(-+)([^- ])", r"\2 \1\3", sub_text, flags=re.M)
    # Dialogue spacing edit.
    sub_text = re.sub(r"^(-+)([^- ])", r"\1 \2", sub_text, flags=re.M)
    # Allow subs to start with any number of '>' before a tag: escape each
    # matched '>' individually (a fixed-length replacement always produced
    # exactly two entities).
    sub_text = re.sub(
        r"(^[^<>]*)(>+)",
        lambda match: match.group(1) + "&gt;" * len(match.group(2)),
        sub_text,
        flags=re.M,
    )
    if display_align == "before" and locations:
        sub_text = r"{\an8}" + sub_text
    return sub_text


# Module-level switches read by para_to_text_list and combine_subs.  The
# command-line options below overwrite them when the file runs as a script.
fix_amazon = True
locations = True
# Defined here as well so combine_subs can be called after a plain import.
exclude_colors = []


def to_srt_timestamp(value):
    """Convert a TTML time expression to the form used in SRT cues.

    Values ending in ``t`` are tick counts.  They are converted assuming
    10,000,000 ticks per second (the factor the original conversion used;
    the document's tick rate is not consulted) and rendered as a
    zero-padded ``HH:MM:SS,mmm``.  Any other value is returned with ``.``
    replaced by ``,``.

    Args:
        value: The ``begin``/``end`` attribute text.

    Returns:
        The SRT-style timestamp string.
    """
    if not value.endswith("t"):
        return value.replace(".", ",")
    # Integer millisecond arithmetic keeps every field in range and padded
    # (no "5,500" or "60,000" seconds).
    total_ms = round(float(value[:-1]) / 10000)
    hours, remainder = divmod(total_ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    seconds, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="The Timed Text Markup Language (TTML) converter."
    )
    parser.add_argument(
        "file_list",
        type=str,
        metavar="subtitles",
        nargs="+",
        help="file or directory containing the release",
    )
    parser.add_argument(
        "--color",
        dest="color",
        default=False,
        action="store_true",
        help="The color will be copied.",
    )
    parser.add_argument(
        "--exclude-colors",
        dest="exclude_colors",
        type=str,
        default='',
        help="Colors to exclude, comma delimited.",
    )
    parser.add_argument(
        "--font",
        dest="font",
        default=False,
        action="store_true",
        help="The font will be copied.",
    )
    parser.add_argument(
        "--no-amazon-fix",
        dest="no_amazon_fix",
        default=False,
        action="store_true",
        help="Disable the amazon empty span fix.",
    )
    parser.add_argument(
        "--no-locations",
        dest="no_locations",
        default=False,
        action="store_true",
        help="Don't put locations.",
    )
    args = parser.parse_args()
    fix_amazon = not args.no_amazon_fix
    locations = not args.no_locations
    exclude_colors = [
        c.lower().strip()
        for c in args.exclude_colors.split(',')
    ]
    for filename in args.file_list:
        json_sub = convert(filename)
        # The .srt file is written next to the input, with the same base name.
        base_name = os.path.basename(filename)
        file = os.path.splitext(base_name)[0]
        new_filename = f"{file}.srt"
        save_folder = Path(os.path.abspath(filename)).parent.as_posix()
        new_path = os.path.join(save_folder, new_filename)
        entries = []
        for sub_number, sub in enumerate(json_sub, start=1):
            start_time = to_srt_timestamp(sub.get("begin", ""))
            end_time = to_srt_timestamp(sub.get("end", ""))
            sub_text = combine_subs(sub, use_color=args.color, use_font=args.font)
            entries.append(
                f"{sub_number}\n{start_time} --> {end_time}\n{sub_text}\n\n"
            )
        # Explicit UTF-8 so non-ASCII subtitle text does not depend on the
        # platform's default encoding; ``with`` closes the file on errors.
        with open(new_path, "w", encoding="utf-8") as f:
            f.write("".join(entries))

vevv · 2023-03-05T17:01:03Z

Thanks! I'll have a look later and try to adapt it here, unless you're planning on doing that and sending a PR.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Switch off BeautifulSoup4 #3

Switch off BeautifulSoup4 #3

vevv commented Feb 13, 2023

MiM-MiM commented Mar 4, 2023

vevv commented Mar 5, 2023

Switch off BeautifulSoup4 #3

Switch off BeautifulSoup4 #3

Comments

vevv commented Feb 13, 2023

MiM-MiM commented Mar 4, 2023

vevv commented Mar 5, 2023