From 045dc3fe4258eb8be572037676d4864d3c5a15db Mon Sep 17 00:00:00 2001
From: dsactionengine
 <128065385+dsavinov-actionengine@users.noreply.github.com>
Date: Fri, 17 May 2024 19:56:33 +0200
Subject: [PATCH 1/3] Adding support for VTT format

---
 bin/create_files.py                           |   3 +-
 docker/Dockerfile                             |   1 +
 openformats/formats/vtt.py                    | 174 ++++++++++++++++++
 openformats/tests/formats/vtt/__init__.py     |   0
 openformats/tests/formats/vtt/files/1_el.vtt  |  35 ++++
 openformats/tests/formats/vtt/files/1_en.vtt  |  35 ++++
 openformats/tests/formats/vtt/files/1_tpl.vtt |  32 ++++
 openformats/tests/formats/vtt/test_vtt.py     | 120 ++++++++++++
 openformats/tests/utils/dictionary.py         |   2 +-
 requirements.txt                              |   2 +
 10 files changed, 402 insertions(+), 2 deletions(-)
 create mode 100644 openformats/formats/vtt.py
 create mode 100644 openformats/tests/formats/vtt/__init__.py
 create mode 100644 openformats/tests/formats/vtt/files/1_el.vtt
 create mode 100644 openformats/tests/formats/vtt/files/1_en.vtt
 create mode 100644 openformats/tests/formats/vtt/files/1_tpl.vtt
 create mode 100644 openformats/tests/formats/vtt/test_vtt.py

diff --git a/bin/create_files.py b/bin/create_files.py
index c26d9236..4114513f 100755
--- a/bin/create_files.py
+++ b/bin/create_files.py
@@ -15,7 +15,7 @@
 from io import open
 
 from openformats.formats import (android, github_markdown_v2, json, plaintext,
-                                 po, srt)
+                                 po, srt, vtt)
 from openformats.tests.utils import translate_stringset
 
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -29,6 +29,7 @@ def get_handler(ext):
     return {
         'txt': plaintext.PlaintextHandler(),
         'srt': srt.SrtHandler(),
+        'vtt': vtt.VttHandler(),
         'xml': android.AndroidHandler(),
         'json': json.JsonHandler(),
         'po': po.PoHandler(),
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 1bf56f7a..d7e9919f 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -20,6 +20,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends $PKGS && \
 ENV PYTHONDONTWRITEBYTECODE=1
 
 COPY requirements.txt /requirements.txt
+RUN pip install --upgrade pip
 RUN pip install -r /requirements.txt
 
 WORKDIR /app
diff --git a/openformats/formats/vtt.py b/openformats/formats/vtt.py
new file mode 100644
index 00000000..8e457de3
--- /dev/null
+++ b/openformats/formats/vtt.py
@@ -0,0 +1,174 @@
+import re
+
+from ..handlers import Handler
+from openformats.exceptions import ParseError
+from openformats.strings import OpenString
+from openformats.transcribers import Transcriber
+
+
+class VttHandler(Handler):
+    name = "VTT"
+    extension = "vtt"
+    EXTRACTS_RAW = False
+
+    NON_SPACE_PAT = re.compile(r'[^\s]')
+
+    def _generate_split_subtitles(self, content, **kwargs):
+        start = 0
+        for section in content.split('\n\n'):  # sections are separated by blank lines
+            # find first non-space character of section
+            match = self.NON_SPACE_PAT.search(section)
+            if match:
+                yield start + match.start(), section.strip()
+            start += len(section) + 2
+
+    def parse(self, content):
+        self.transcriber = Transcriber(content)
+        source = self.transcriber.source
+        stringset = []
+        for start, subtitle_section in self._generate_split_subtitles(source):
+            self.transcriber.copy_until(start)
+            offset, string = self._parse_section(start, subtitle_section)
+
+            if string:
+                stringset.append(string)
+
+                self.transcriber.copy_until(offset)
+                self.transcriber.add(string.template_replacement)
+                self.transcriber.skip(len(string.string))
+            else:
+                self.transcriber.copy_until(start + len(subtitle_section))
+
+        self.transcriber.copy_until(len(source))
+
+        template = self.transcriber.get_destination()
+        if not template.startswith('WEBVTT'):
+            raise ParseError("VTT file should start with 'WEBVTT'!")
+        return template, stringset
+
+    def _parse_section(self, offset, section):
+        src_strings = section.split('\n')  # identifier_str is optional in VTT
+
+        timings = ""
+        timings_index = -1
+        for i in range(len(src_strings)):
+            str = src_strings[i];
+            if "-->" in str:
+                timings = str
+                timings_index = i
+
+        if timings_index < 0:
+            return None, None
+
+        # Identifier (lines preceding the line with timings) is optional in VTT.
+        # Identifier can be either numeric or textual, and it is not necessarily unique.
+        identifier = '\n'.join(src_strings[:timings_index])
+
+        # timings
+        timings_parse_error = False
+        try:
+            splitted = timings.split(None, 3)
+            if len(splitted) == 3:
+                start, arrow, end = splitted
+            else:
+                start, arrow, end, _ = splitted
+        except ValueError:
+            timings_parse_error = True
+        else:
+            if arrow != "-->":
+                timings_parse_error = True
+        if timings_parse_error:
+            raise ParseError(
+                f"Timings on line {self.transcriber.line_number + 1} "
+                "don't follow '[start] --> [end] (position)' pattern"
+            )
+        try:
+            start = self._format_timing(start)
+        except ValueError:
+            raise ParseError(
+                f"Problem with start of timing at line {self.transcriber.line_number + 1}: '{start}'"
+            )
+        try:
+            end = self._format_timing(end)
+        except ValueError:
+            raise ParseError(
+                f"Problem with end of timing at line {self.transcriber.line_number + 1}: '{end}'"
+            )
+
+        # Content
+        string_to_translate = '\n'.join(src_strings[timings_index+1:])
+        if string_to_translate == "":
+            raise ParseError(f"Subtitle is empty on line {self.transcriber.line_number + 2}")
+
+        string = OpenString(timings, string_to_translate,
+                            occurrences=f"{start},{end}")
+        offset += len(identifier) + len(timings) + 1;
+        if len(identifier):
+            offset += 1
+        return offset, string
+
+    def _format_timing(self, timing):
+        try:
+            rest, milliseconds = timing.split('.')
+            milliseconds = f"{milliseconds:<03}"
+        except ValueError:
+            rest, milliseconds = timing, "000"
+        # timing may or may not contain hours part
+        if rest.count(':') == 1:
+            minutes, seconds = rest.split(':')
+            minutes, seconds, milliseconds = (int(minutes),
+                                              int(seconds),
+                                              int(milliseconds))
+            return f"{minutes:02}:{seconds:02}.{milliseconds:03}"
+        elif rest.count(':') == 2:
+            hours, minutes, seconds = rest.split(':')
+            hours, minutes, seconds, milliseconds = (int(hours),
+                                                    int(minutes),
+                                                    int(seconds),
+                                                    int(milliseconds))
+            return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"
+        else:
+            raise ParseError(f"Unexpected timing format on line {self.transcriber.line_number + 2}")
+
+    def compile(self, template, stringset, **kwargs):
+        transcriber = Transcriber(template)
+        template = transcriber.source
+        stringset = iter(stringset)
+        string = next(stringset)
+
+        for start, subtitle_section in self._generate_split_subtitles(template):
+            transcriber.copy_until(start)
+            transcriber.mark_section_start()
+
+            # Find hash after timings
+            hash_position = -1
+            if subtitle_section.count('-->') > 0:
+                arrow_pos = subtitle_section.index('-->')
+                end_of_timings = subtitle_section.index('\n', arrow_pos + len('-->'))
+                hash_position = end_of_timings + 1
+
+            if hash_position < 0:
+                transcriber.copy_until(start + len(subtitle_section))
+                transcriber.mark_section_end()
+            elif (subtitle_section[
+                    hash_position:
+                    hash_position + len(string.template_replacement)
+                    ] == string.template_replacement):
+                # found it
+                transcriber.copy_until(start + hash_position)
+                transcriber.add(string.string)
+                transcriber.skip(len(string.template_replacement))
+                transcriber.copy_until(start + len(subtitle_section))
+                transcriber.mark_section_end()
+                try:
+                    string = next(stringset)
+                except StopIteration:
+                    pass
+            else:
+                # did not find it, must remove section
+                transcriber.copy_until(start + len(subtitle_section))
+                transcriber.mark_section_end()
+                transcriber.remove_section()
+
+        transcriber.copy_until(len(template))
+        return transcriber.get_destination()
diff --git a/openformats/tests/formats/vtt/__init__.py b/openformats/tests/formats/vtt/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/openformats/tests/formats/vtt/files/1_el.vtt b/openformats/tests/formats/vtt/files/1_el.vtt
new file mode 100644
index 00000000..67f93a3e
--- /dev/null
+++ b/openformats/tests/formats/vtt/files/1_el.vtt
@@ -0,0 +1,35 @@
+WEBVTT
+
+STYLE here
+some long,
+long style
+
+1
+00:01:28.797 --> 00:01:30.297
+Γεια σου, Κόσμε!
+
+NOTE some note
+here
+
+2
+00:01:45.105 --> 00:01:47.940 X:350 Y:240
+Pinky: Brain, ρι θες να κάνουμε απόψε;
+Brain: Ό,τι και κάθε βράδυ, Pinky: θα κατακτήσουμε τον κόσμο!
+
+3
+00:02:45.105 --> 00:02:47.940
+el:A phrase with escaped &lt;HTML tags&gt;
+
+4
+00:03:45.105 --> 00:03:47.940
+el:<font color="#00ff00">A phrase with <b>HTML</b> characters</font>
+
+5
+00:05:45.105 --> 00:05:47.940
+el:A phrase with unicode characters: ΑβΓδΕ → ♡ Ш
+
+6
+00:06:45.105 --> 00:06:47.940
+el:Three lines: First
+Second
+Third
diff --git a/openformats/tests/formats/vtt/files/1_en.vtt b/openformats/tests/formats/vtt/files/1_en.vtt
new file mode 100644
index 00000000..2201aa1a
--- /dev/null
+++ b/openformats/tests/formats/vtt/files/1_en.vtt
@@ -0,0 +1,35 @@
+WEBVTT
+
+STYLE here
+some long,
+long style
+
+1
+00:01:28.797 --> 00:01:30.297
+Hello, World!
+
+NOTE some note
+here
+
+2
+00:01:45.105 --> 00:01:47.940 X:350 Y:240
+Pinky: Gee, Brain, what do you want to do tonight?
+Brain: The same thing we do every night, Pinky - try to take over the world!
+
+3
+00:02:45.105 --> 00:02:47.940
+A phrase with escaped &lt;HTML tags&gt;
+
+4
+00:03:45.105 --> 00:03:47.940
+<font color="#00ff00">A phrase with <b>HTML</b> characters</font>
+
+5
+00:05:45.105 --> 00:05:47.940
+A phrase with unicode characters: ΑβΓδΕ → ♡ Ш
+
+6
+00:06:45.105 --> 00:06:47.940
+Three lines: First
+Second
+Third
diff --git a/openformats/tests/formats/vtt/files/1_tpl.vtt b/openformats/tests/formats/vtt/files/1_tpl.vtt
new file mode 100644
index 00000000..6485f94a
--- /dev/null
+++ b/openformats/tests/formats/vtt/files/1_tpl.vtt
@@ -0,0 +1,32 @@
+WEBVTT
+
+STYLE here
+some long,
+long style
+
+1
+00:01:28.797 --> 00:01:30.297
+c386a46eaaa5ecd18e760683c3e36987_tr
+
+NOTE some note
+here
+
+2
+00:01:45.105 --> 00:01:47.940 X:350 Y:240
+f3736d657f04cedbb1eefd07e7fb4e53_tr
+
+3
+00:02:45.105 --> 00:02:47.940
+12a3c29d1c2ead6744096c2bcf5cb5a0_tr
+
+4
+00:03:45.105 --> 00:03:47.940
+32189023ec2e2af1c96ff6e50889a8e5_tr
+
+5
+00:05:45.105 --> 00:05:47.940
+df27c645bb92280c825e3e1c94a3f0b8_tr
+
+6
+00:06:45.105 --> 00:06:47.940
+22394ab09ce61d63e1f9d56ef64c4e40_tr
diff --git a/openformats/tests/formats/vtt/test_vtt.py b/openformats/tests/formats/vtt/test_vtt.py
new file mode 100644
index 00000000..f5b35ba0
--- /dev/null
+++ b/openformats/tests/formats/vtt/test_vtt.py
@@ -0,0 +1,120 @@
+import unittest
+
+from openformats.tests.formats.common import CommonFormatTestMixin
+from openformats.tests.utils import strip_leading_spaces
+from openformats.formats.vtt import VttHandler
+
+
+class VttTestCase(CommonFormatTestMixin, unittest.TestCase):
+    HANDLER_CLASS = VttHandler
+    TESTFILE_BASE = "openformats/tests/formats/vtt/files"
+
+    def test_vtt_metadata(self):
+        """vtt: Test that metadata is included in template but not included in stringset."""
+        source = strip_leading_spaces("""WEBVTT
+
+            STYLE
+            ::cue(v) {
+            color: red;
+            }
+
+            REGION
+            id:fred
+            width:40%
+
+            1
+            00:01:28.797 --> 00:01:30.297
+            Hello, World!
+
+            NOTE want this test to pass
+        """)
+        template, stringset = self.handler.parse(source)
+        for str in stringset:
+            s = str.string
+            self.assertFalse('WEBVTT' in s or 'STYLE' in s or 'REGION' in s or 'NOTE' in s,
+                             'Metadata should not be present in stringset!')
+            break
+        self.assertIn('WEBVTT', template)
+        self.assertIn('STYLE', template)
+        self.assertIn('REGION', template)
+        self.assertIn('NOTE', template)
+
+        source = strip_leading_spaces("""
+            00:01:28.797 --> 00:01:30.297
+            Check the first line
+        """)
+        self._test_parse_error(source, "VTT file should start with 'WEBVTT'!")
+
+    def test_vtt_occurrences(self):
+        """vtt: Test that timings are saved as occurrences."""
+        source = strip_leading_spaces("""WEBVTT
+
+            1
+            00:01:28.797 --> 00:01:30.297
+            Hello, World!
+        """)
+        _, stringset = self.handler.parse(source)
+        self.assertEqual(stringset[0].occurrences, '00:01:28.797,00:01:30.297')
+
+    def test_missing_string(self):
+        source = strip_leading_spaces("""WEBVTT
+
+            1
+            00:01:28.797 --> 00:01:30.297
+        """)
+        self._test_parse_error(
+            source,
+            "Subtitle is empty on line 5"
+        )
+
+    def test_full_and_short_timings(self):
+        source = strip_leading_spaces("""WEBVTT
+
+            00:01:28.797 --> 00:01:30.297
+            Full timings hh:mm:ss.fff
+
+            01:28.797 --> 01:30.297
+            Short timings mm:ss.fff
+
+            28.797 --> 30.297
+            Abnormal timings format ss.fff
+        """)
+        self._test_parse_error(
+            source,
+            "Unexpected timing format on line 11"
+        )
+
+    def test_wrong_timings(self):
+        source = strip_leading_spaces("""WEBVTT
+
+            1
+            00:01:28.797 ---> 00:01:30.297
+            Hello, World!
+        """)
+        self._test_parse_error(
+            source,
+            "Timings on line 4 don't follow '[start] --> [end] (position)' "
+            "pattern"
+        )
+
+        source = strip_leading_spaces("""WEBVTT
+
+            1
+            00:fas28.797 --> 00:01:30.297
+            Hello, World!
+        """)
+        self._test_parse_error(
+            source,
+            "Problem with start of timing at line 4: '00:fas28.797'"
+        )
+
+        source = strip_leading_spaces("""WEBVTT
+
+            1
+            00:01:28.797 --> 00:ois30.297
+            Hello, World!
+        """)
+        self._test_parse_error(
+            source,
+            "Problem with end of timing at line 4: '00:ois30.297'"
+        )
diff --git a/openformats/tests/utils/dictionary.py b/openformats/tests/utils/dictionary.py
index 263a38f8..2b93b516 100644
--- a/openformats/tests/utils/dictionary.py
+++ b/openformats/tests/utils/dictionary.py
@@ -28,7 +28,7 @@ class FunkyDictionary(object):
     def __init__(self):
         self.phrase_list = []
         self.phrase_dict = {}
-        with open(DICT_FNAME, 'rU', encoding='utf-8') as dict_file:
+        with open(DICT_FNAME, 'r', encoding='utf-8', newline=None) as dict_file:
             dict_reader = csv.DictReader(dict_file)
             for phrase in dict_reader:
                 unicode_phrase = {}
diff --git a/requirements.txt b/requirements.txt
index 89db3630..1a1e570c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,8 @@ pyparsing==2.2.0
 six
 lxml==4.6.5
 beautifulsoup4==4.9.3
+pytest
+mock
 
 # InDesign
 git+https://github.com/kbairak/ucflib@py3_compatibility

From f3ab5070fcd9d2e7a92760644efca716cbb1c13a Mon Sep 17 00:00:00 2001
From: dsactionengine
 <128065385+dsavinov-actionengine@users.noreply.github.com>
Date: Fri, 24 May 2024 16:19:48 +0200
Subject: [PATCH 2/3] Create OpenString order parameter

---
 openformats/formats/vtt.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/openformats/formats/vtt.py b/openformats/formats/vtt.py
index 8e457de3..d0cc8431 100644
--- a/openformats/formats/vtt.py
+++ b/openformats/formats/vtt.py
@@ -1,3 +1,4 @@
+from itertools import count
 import re
 
 from ..handlers import Handler
@@ -26,6 +27,7 @@ def parse(self, content):
         self.transcriber = Transcriber(content)
         source = self.transcriber.source
         stringset = []
+        self._order = count()
         for start, subtitle_section in self._generate_split_subtitles(source):
             self.transcriber.copy_until(start)
             offset, string = self._parse_section(start, subtitle_section)
@@ -101,7 +103,8 @@ def _parse_section(self, offset, section):
             raise ParseError(f"Subtitle is empty on line {self.transcriber.line_number + 2}")
 
         string = OpenString(timings, string_to_translate,
-                            occurrences=f"{start},{end}")
+                            occurrences=f"{start},{end}",
+                            order=next(self._order))
         offset += len(identifier) + len(timings) + 1;
         if len(identifier):
             offset += 1

From 3fdc7838b5c1f7d77ccb2f084db5d10cd0cad456 Mon Sep 17 00:00:00 2001
From: dsactionengine
 <128065385+dsavinov-actionengine@users.noreply.github.com>
Date: Mon, 3 Jun 2024 13:40:05 +0200
Subject: [PATCH 3/3] fixed review comments

---
 openformats/formats/vtt.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/openformats/formats/vtt.py b/openformats/formats/vtt.py
index d0cc8431..53fa0881 100644
--- a/openformats/formats/vtt.py
+++ b/openformats/formats/vtt.py
@@ -58,6 +58,7 @@ def _parse_section(self, offset, section):
             if "-->" in str:
                 timings = str
                 timings_index = i
+                break
 
         if timings_index < 0:
             return None, None
@@ -137,7 +138,10 @@ def compile(self, template, stringset, **kwargs):
         transcriber = Transcriber(template)
         template = transcriber.source
         stringset = iter(stringset)
-        string = next(stringset)
+        try:
+            string = next(stringset)
+        except StopIteration:
+            raise ParseError("stringset cannot be empty")
 
         for start, subtitle_section in self._generate_split_subtitles(template):
             transcriber.copy_until(start)
@@ -147,8 +151,12 @@ def compile(self, template, stringset, **kwargs):
             hash_position = -1
             if subtitle_section.count('-->') > 0:
                 arrow_pos = subtitle_section.index('-->')
-                end_of_timings = subtitle_section.index('\n', arrow_pos + len('-->'))
-                hash_position = end_of_timings + 1
+                try:
+                    end_of_timings = subtitle_section.index('\n', arrow_pos + len('-->'))
+                    hash_position = end_of_timings + 1
+                except ValueError:
+                    # No newlines after timing: subtitle is missing
+                    pass
 
             if hash_position < 0:
                 transcriber.copy_until(start + len(subtitle_section))