From 045dc3fe4258eb8be572037676d4864d3c5a15db Mon Sep 17 00:00:00 2001 From: dsactionengine <128065385+dsavinov-actionengine@users.noreply.github.com> Date: Fri, 17 May 2024 19:56:33 +0200 Subject: [PATCH 1/3] Adding support for VTT format --- bin/create_files.py | 3 +- docker/Dockerfile | 1 + openformats/formats/vtt.py | 174 ++++++++++++++++++ openformats/tests/formats/vtt/__init__.py | 0 openformats/tests/formats/vtt/files/1_el.vtt | 35 ++++ openformats/tests/formats/vtt/files/1_en.vtt | 35 ++++ openformats/tests/formats/vtt/files/1_tpl.vtt | 32 ++++ openformats/tests/formats/vtt/test_vtt.py | 120 ++++++++++++ openformats/tests/utils/dictionary.py | 2 +- requirements.txt | 2 + 10 files changed, 402 insertions(+), 2 deletions(-) create mode 100644 openformats/formats/vtt.py create mode 100644 openformats/tests/formats/vtt/__init__.py create mode 100644 openformats/tests/formats/vtt/files/1_el.vtt create mode 100644 openformats/tests/formats/vtt/files/1_en.vtt create mode 100644 openformats/tests/formats/vtt/files/1_tpl.vtt create mode 100644 openformats/tests/formats/vtt/test_vtt.py diff --git a/bin/create_files.py b/bin/create_files.py index c26d9236..4114513f 100755 --- a/bin/create_files.py +++ b/bin/create_files.py @@ -15,7 +15,7 @@ from io import open from openformats.formats import (android, github_markdown_v2, json, plaintext, - po, srt) + po, srt, vtt) from openformats.tests.utils import translate_stringset sys.path.append(os.path.join(os.path.dirname(__file__), "..")) @@ -29,6 +29,7 @@ def get_handler(ext): return { 'txt': plaintext.PlaintextHandler(), 'srt': srt.SrtHandler(), + 'vtt': vtt.VttHandler(), 'xml': android.AndroidHandler(), 'json': json.JsonHandler(), 'po': po.PoHandler(), diff --git a/docker/Dockerfile b/docker/Dockerfile index 1bf56f7a..d7e9919f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -20,6 +20,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends $PKGS && \ ENV PYTHONDONTWRITEBYTECODE=1 COPY requirements.txt /requirements.txt +RUN pip install --upgrade pip RUN pip install -r /requirements.txt WORKDIR /app diff --git a/openformats/formats/vtt.py b/openformats/formats/vtt.py new file mode 100644 index 00000000..8e457de3 --- /dev/null +++ b/openformats/formats/vtt.py @@ -0,0 +1,174 @@ +import re + +from ..handlers import Handler +from openformats.exceptions import ParseError +from openformats.strings import OpenString +from openformats.transcribers import Transcriber + + +class VttHandler(Handler): + name = "VTT" + extension = "vtt" + EXTRACTS_RAW = False + + NON_SPACE_PAT = re.compile(r'[^\s]') + + def _generate_split_subtitles(self, content, **kwargs): + start = 0 + for section in content.split('\n\n'): # sections are separated by blank lines + # find first non-space character of section + match = self.NON_SPACE_PAT.search(section) + if match: + yield start + match.start(), section.strip() + start += len(section) + 2 + + def parse(self, content): + self.transcriber = Transcriber(content) + source = self.transcriber.source + stringset = [] + for start, subtitle_section in self._generate_split_subtitles(source): + self.transcriber.copy_until(start) + offset, string = self._parse_section(start, subtitle_section) + + if string: + stringset.append(string) + + self.transcriber.copy_until(offset) + self.transcriber.add(string.template_replacement) + self.transcriber.skip(len(string.string)) + else: + self.transcriber.copy_until(start + len(subtitle_section)) + + self.transcriber.copy_until(len(source)) + + template = self.transcriber.get_destination() + if not template.startswith('WEBVTT'): + raise ParseError("VTT file should start with 'WEBVTT'!") + return template, stringset + + def _parse_section(self, offset, section): + src_strings = section.split('\n') # identifier_str is optional in VTT + + timings = "" + timings_index = -1 + for i in range(len(src_strings)): + str = src_strings[i]; + if "-->" in str: + timings = str + timings_index = i + + if timings_index < 0: + return None, None + + # Identifier (lines preceding the line with timings) is optional in VTT. + # Identifier can be either numberic or textual, and it is not necessarily unique. + identifier = '\n'.join(src_strings[:timings_index]) + + # timings + timings_parse_error = False + try: + splitted = timings.split(None, 3) + if len(splitted) == 3: + start, arrow, end = splitted + else: + start, arrow, end, _ = splitted + except ValueError: + timings_parse_error = True + else: + if arrow != "-->": + timings_parse_error = True + if timings_parse_error: + raise ParseError( + f"Timings on line {self.transcriber.line_number + 1} " + "don't follow '[start] --> [end] (position)' pattern" + ) + try: + start = self._format_timing(start) + except ValueError: + raise ParseError( + f"Problem with start of timing at line {self.transcriber.line_number + 1}: '{start}'" + ) + try: + end = self._format_timing(end) + except ValueError: + raise ParseError( + f"Problem with end of timing at line {self.transcriber.line_number + 1}: '{end}'" + ) + + # Content + string_to_translate = '\n'.join(src_strings[timings_index+1:]) + if string_to_translate == "": + raise ParseError(f"Subtitle is empty on line {self.transcriber.line_number + 2}") + + string = OpenString(timings, string_to_translate, + occurrences=f"{start},{end}") + offset += len(identifier) + len(timings) + 1; + if len(identifier): + offset += 1 + return offset, string + + def _format_timing(self, timing): + try: + rest, milliseconds = timing.split('.') + milliseconds = f"{milliseconds:<03}" + except ValueError: + rest, milliseconds = timing, "000" + # timing may or may not contain hours part + if rest.count(':') == 1: + minutes, seconds = rest.split(':') + minutes, seconds, milliseconds = (int(minutes), + int(seconds), + int(milliseconds)) + return f"{minutes:02}:{seconds:02}.{milliseconds:03}" + elif rest.count(':') == 2: + hours, minutes, seconds = rest.split(':') + hours, minutes, seconds, milliseconds = (int(hours), + int(minutes), + int(seconds), + int(milliseconds)) + return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}" + else: + raise ParseError(f"Unexpected timing format on line {self.transcriber.line_number + 2}") + + def compile(self, template, stringset, **kwargs): + transcriber = Transcriber(template) + template = transcriber.source + stringset = iter(stringset) + string = next(stringset) + + for start, subtitle_section in self._generate_split_subtitles(template): + transcriber.copy_until(start) + transcriber.mark_section_start() + + # Find hash after timings + hash_position = -1 + if subtitle_section.count('-->') > 0: + arrow_pos = subtitle_section.index('-->') + end_of_timings = subtitle_section.index('\n', arrow_pos + len('-->')) + hash_position = end_of_timings + 1 + + if hash_position < 0: + transcriber.copy_until(start + len(subtitle_section)) + transcriber.mark_section_end() + elif (subtitle_section[ + hash_position: + hash_position + len(string.template_replacement) + ] == string.template_replacement): + # found it + transcriber.copy_until(start + hash_position) + transcriber.add(string.string) + transcriber.skip(len(string.template_replacement)) + transcriber.copy_until(start + len(subtitle_section)) + transcriber.mark_section_end() + try: + string = next(stringset) + except StopIteration: + pass + else: + # did not find it, must remove section + transcriber.copy_until(start + len(subtitle_section)) + transcriber.mark_section_end() + transcriber.remove_section() + + transcriber.copy_until(len(template)) + return transcriber.get_destination() diff --git a/openformats/tests/formats/vtt/__init__.py b/openformats/tests/formats/vtt/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/openformats/tests/formats/vtt/files/1_el.vtt b/openformats/tests/formats/vtt/files/1_el.vtt new file mode 100644 index 00000000..67f93a3e --- /dev/null +++ b/openformats/tests/formats/vtt/files/1_el.vtt @@ -0,0 +1,35 @@ +WEBVTT + +STYLE here +some long, +long style + +1 +00:01:28.797 --> 00:01:30.297 +Γεια σου, Κόσμε! + +NOTE some note +here + +2 +00:01:45.105 --> 00:01:47.940 X:350 Y:240 +Pinky: Brain, ρι θες να κάνουμε απόψε; +Brain: Ό,τι και κάθε βράδυ, Pinky: θα κατακτήσουμε τον κόσμο! + +3 +00:02:45.105 --> 00:02:47.940 +el:A phrase with escaped <HTML tags> + +4 +00:03:45.105 --> 00:03:47.940 +el:A phrase with HTML characters + +5 +00:05:45.105 --> 00:05:47.940 +el:A phrase with unicode characters: ΑβΓδΕ → ♡ Ш + +6 +00:06:45.105 --> 00:06:47.940 +el:Three lines: First +Second +Third diff --git a/openformats/tests/formats/vtt/files/1_en.vtt b/openformats/tests/formats/vtt/files/1_en.vtt new file mode 100644 index 00000000..2201aa1a --- /dev/null +++ b/openformats/tests/formats/vtt/files/1_en.vtt @@ -0,0 +1,35 @@ +WEBVTT + +STYLE here +some long, +long style + +1 +00:01:28.797 --> 00:01:30.297 +Hello, World! + +NOTE some note +here + +2 +00:01:45.105 --> 00:01:47.940 X:350 Y:240 +Pinky: Gee, Brain, what do you want to do tonight? +Brain: The same thing we do every night, Pinky - try to take over the world! + +3 +00:02:45.105 --> 00:02:47.940 +A phrase with escaped <HTML tags> + +4 +00:03:45.105 --> 00:03:47.940 +A phrase with HTML characters + +5 +00:05:45.105 --> 00:05:47.940 +A phrase with unicode characters: ΑβΓδΕ → ♡ Ш + +6 +00:06:45.105 --> 00:06:47.940 +Three lines: First +Second +Third diff --git a/openformats/tests/formats/vtt/files/1_tpl.vtt b/openformats/tests/formats/vtt/files/1_tpl.vtt new file mode 100644 index 00000000..6485f94a --- /dev/null +++ b/openformats/tests/formats/vtt/files/1_tpl.vtt @@ -0,0 +1,32 @@ +WEBVTT + +STYLE here +some long, +long style + +1 +00:01:28.797 --> 00:01:30.297 +c386a46eaaa5ecd18e760683c3e36987_tr + +NOTE some note +here + +2 +00:01:45.105 --> 00:01:47.940 X:350 Y:240 +f3736d657f04cedbb1eefd07e7fb4e53_tr + +3 +00:02:45.105 --> 00:02:47.940 +12a3c29d1c2ead6744096c2bcf5cb5a0_tr + +4 +00:03:45.105 --> 00:03:47.940 +32189023ec2e2af1c96ff6e50889a8e5_tr + +5 +00:05:45.105 --> 00:05:47.940 +df27c645bb92280c825e3e1c94a3f0b8_tr + +6 +00:06:45.105 --> 00:06:47.940 +22394ab09ce61d63e1f9d56ef64c4e40_tr diff --git a/openformats/tests/formats/vtt/test_vtt.py b/openformats/tests/formats/vtt/test_vtt.py new file mode 100644 index 00000000..f5b35ba0 --- /dev/null +++ b/openformats/tests/formats/vtt/test_vtt.py @@ -0,0 +1,120 @@ +import unittest + +from openformats.tests.formats.common import CommonFormatTestMixin +from openformats.tests.utils import strip_leading_spaces +from openformats.formats.vtt import VttHandler + + +class VttTestCase(CommonFormatTestMixin, unittest.TestCase): + HANDLER_CLASS = VttHandler + TESTFILE_BASE = "openformats/tests/formats/vtt/files" + + def test_vtt_metadata(self): + """vtt: Test that metadata is included in template but not included in stringset.""" + source = strip_leading_spaces("""WEBVTT + + STYLE + ::cue(v) { + color: red; + } + + REGION + id:fred + width:40% + + 1 + 00:01:28.797 --> 00:01:30.297 + Hello, World! + + NOTE want this test to pass + """) + template, stringset = self.handler.parse(source) + for str in stringset: + s = str.string + self.assertFalse('WEBVTT' in s or 'STYLE' in s or 'REGION' in s or 'NOTE' in s, + 'Metadata should not be present in stringset!') + break + self.assertIn('WEBVTT', template) + self.assertIn('STYLE', template) + self.assertIn('REGION', template) + self.assertIn('NOTE', template) + + source = strip_leading_spaces(""" + 00:01:28.797 --> 00:01:30.297 + Check the first line + """) + self._test_parse_error(source, "VTT file should start with 'WEBVTT'!") + + def test_vtt_occurrences(self): + """vtt: Test that timings are saved as occurrencies.""" + source = strip_leading_spaces("""WEBVTT + + 1 + 00:01:28.797 --> 00:01:30.297 + Hello, World! + """) + _, stringset = self.handler.parse(source) + self.assertEqual(stringset[0].occurrences, '00:01:28.797,00:01:30.297') + + def test_missing_string(self): + source = strip_leading_spaces("""WEBVTT + + 1 + 00:01:28.797 --> 00:01:30.297 + """) + self._test_parse_error( + source, + "Subtitle is empty on line 5" + ) + + def test_full_and_short_timings(self): + source = strip_leading_spaces("""WEBVTT + + 00:01:28.797 --> 00:01:30.297 + Full timings hh:mm:ss.fff + + 01:28.797 --> 01:30.297 + Short timings mm:ss.fff + + 28.797 --> 30.297 + Abnormal timings format ss.fff + """) + self._test_parse_error( + source, + "Unexpected timing format on line 11" + ) + + def test_wrong_timings(self): + source = strip_leading_spaces("""WEBVTT + + 1 + 00:01:28.797 ---> 00:01:30.297 + Hello, World! + """) + self._test_parse_error( + source, + "Timings on line 4 don't follow '[start] --> [end] (position)' " + "pattern" + ) + + source = strip_leading_spaces("""WEBVTT + + 1 + 00:fas28.797 --> 00:01:30.297 + Hello, World! + """) + self._test_parse_error( + source, + "Problem with start of timing at line 4: '00:fas28.797'" + ) + + source = strip_leading_spaces("""WEBVTT + + 1 + 00:01:28.797 --> 00:ois30.297 + Hello, World! + """) + self._test_parse_error( + source, + "Problem with end of timing at line 4: '00:ois30.297'" + ) diff --git a/openformats/tests/utils/dictionary.py b/openformats/tests/utils/dictionary.py index 263a38f8..2b93b516 100644 --- a/openformats/tests/utils/dictionary.py +++ b/openformats/tests/utils/dictionary.py @@ -28,7 +28,7 @@ class FunkyDictionary(object): def __init__(self): self.phrase_list = [] self.phrase_dict = {} - with open(DICT_FNAME, 'rU', encoding='utf-8') as dict_file: + with open(DICT_FNAME, 'r', encoding='utf-8', newline=None) as dict_file: dict_reader = csv.DictReader(dict_file) for phrase in dict_reader: unicode_phrase = {} diff --git a/requirements.txt b/requirements.txt index 89db3630..1a1e570c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,8 @@ pyparsing==2.2.0 six lxml==4.6.5 beautifulsoup4==4.9.3 +pytest +mock # InDesign git+https://github.com/kbairak/ucflib@py3_compatibility From f3ab5070fcd9d2e7a92760644efca716cbb1c13a Mon Sep 17 00:00:00 2001 From: dsactionengine <128065385+dsavinov-actionengine@users.noreply.github.com> Date: Fri, 24 May 2024 16:19:48 +0200 Subject: [PATCH 2/3] Create OpenString order parameter --- openformats/formats/vtt.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/openformats/formats/vtt.py b/openformats/formats/vtt.py index 8e457de3..d0cc8431 100644 --- a/openformats/formats/vtt.py +++ b/openformats/formats/vtt.py @@ -1,3 +1,4 @@ +from itertools import count import re from ..handlers import Handler @@ -26,6 +27,7 @@ def parse(self, content): self.transcriber = Transcriber(content) source = self.transcriber.source stringset = [] + self._order = count() for start, subtitle_section in self._generate_split_subtitles(source): self.transcriber.copy_until(start) offset, string = self._parse_section(start, subtitle_section) @@ -101,7 +103,8 @@ def _parse_section(self, offset, section): raise ParseError(f"Subtitle is empty on line {self.transcriber.line_number + 2}") string = OpenString(timings, string_to_translate, - occurrences=f"{start},{end}") + occurrences=f"{start},{end}", + order=next(self._order)) offset += len(identifier) + len(timings) + 1; if len(identifier): offset += 1 From 3fdc7838b5c1f7d77ccb2f084db5d10cd0cad456 Mon Sep 17 00:00:00 2001 From: dsactionengine <128065385+dsavinov-actionengine@users.noreply.github.com> Date: Mon, 3 Jun 2024 13:40:05 +0200 Subject: [PATCH 3/3] fixed review comments --- openformats/formats/vtt.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/openformats/formats/vtt.py b/openformats/formats/vtt.py index d0cc8431..53fa0881 100644 --- a/openformats/formats/vtt.py +++ b/openformats/formats/vtt.py @@ -58,6 +58,7 @@ def _parse_section(self, offset, section): if "-->" in str: timings = str timings_index = i + break if timings_index < 0: return None, None @@ -137,7 +138,10 @@ def compile(self, template, stringset, **kwargs): transcriber = Transcriber(template) template = transcriber.source stringset = iter(stringset) - string = next(stringset) + try: + string = next(stringset) + except StopIteration: + raise ParseError("stringset cannot be empty") for start, subtitle_section in self._generate_split_subtitles(template): transcriber.copy_until(start) @@ -147,8 +151,12 @@ def compile(self, template, stringset, **kwargs): hash_position = -1 if subtitle_section.count('-->') > 0: arrow_pos = subtitle_section.index('-->') - end_of_timings = subtitle_section.index('\n', arrow_pos + len('-->')) - hash_position = end_of_timings + 1 + try: + end_of_timings = subtitle_section.index('\n', arrow_pos + len('-->')) + hash_position = end_of_timings + 1 + except ValueError: + # No newlines after timing: subtitle is missing + pass if hash_position < 0: transcriber.copy_until(start + len(subtitle_section))