From 045dc3fe4258eb8be572037676d4864d3c5a15db Mon Sep 17 00:00:00 2001
From: dsactionengine
<128065385+dsavinov-actionengine@users.noreply.github.com>
Date: Fri, 17 May 2024 19:56:33 +0200
Subject: [PATCH 1/3] Adding support for VTT format
---
bin/create_files.py | 3 +-
docker/Dockerfile | 1 +
openformats/formats/vtt.py | 174 ++++++++++++++++++
openformats/tests/formats/vtt/__init__.py | 0
openformats/tests/formats/vtt/files/1_el.vtt | 35 ++++
openformats/tests/formats/vtt/files/1_en.vtt | 35 ++++
openformats/tests/formats/vtt/files/1_tpl.vtt | 32 ++++
openformats/tests/formats/vtt/test_vtt.py | 120 ++++++++++++
openformats/tests/utils/dictionary.py | 2 +-
requirements.txt | 2 +
10 files changed, 402 insertions(+), 2 deletions(-)
create mode 100644 openformats/formats/vtt.py
create mode 100644 openformats/tests/formats/vtt/__init__.py
create mode 100644 openformats/tests/formats/vtt/files/1_el.vtt
create mode 100644 openformats/tests/formats/vtt/files/1_en.vtt
create mode 100644 openformats/tests/formats/vtt/files/1_tpl.vtt
create mode 100644 openformats/tests/formats/vtt/test_vtt.py
diff --git a/bin/create_files.py b/bin/create_files.py
index c26d9236..4114513f 100755
--- a/bin/create_files.py
+++ b/bin/create_files.py
@@ -15,7 +15,7 @@
from io import open
from openformats.formats import (android, github_markdown_v2, json, plaintext,
- po, srt)
+ po, srt, vtt)
from openformats.tests.utils import translate_stringset
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -29,6 +29,7 @@ def get_handler(ext):
return {
'txt': plaintext.PlaintextHandler(),
'srt': srt.SrtHandler(),
+ 'vtt': vtt.VttHandler(),
'xml': android.AndroidHandler(),
'json': json.JsonHandler(),
'po': po.PoHandler(),
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 1bf56f7a..d7e9919f 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -20,6 +20,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends $PKGS && \
ENV PYTHONDONTWRITEBYTECODE=1
COPY requirements.txt /requirements.txt
+RUN pip install --upgrade pip
RUN pip install -r /requirements.txt
WORKDIR /app
diff --git a/openformats/formats/vtt.py b/openformats/formats/vtt.py
new file mode 100644
index 00000000..8e457de3
--- /dev/null
+++ b/openformats/formats/vtt.py
@@ -0,0 +1,174 @@
+import re
+
+from ..handlers import Handler
+from openformats.exceptions import ParseError
+from openformats.strings import OpenString
+from openformats.transcribers import Transcriber
+
+
+class VttHandler(Handler):
+ name = "VTT"
+ extension = "vtt"
+ EXTRACTS_RAW = False
+
+ NON_SPACE_PAT = re.compile(r'[^\s]')
+
+ def _generate_split_subtitles(self, content, **kwargs):
+ start = 0
+ for section in content.split('\n\n'): # sections are separated by blank lines
+ # find first non-space character of section
+ match = self.NON_SPACE_PAT.search(section)
+ if match:
+ yield start + match.start(), section.strip()
+ start += len(section) + 2
+
+ def parse(self, content):
+ self.transcriber = Transcriber(content)
+ source = self.transcriber.source
+ stringset = []
+ for start, subtitle_section in self._generate_split_subtitles(source):
+ self.transcriber.copy_until(start)
+ offset, string = self._parse_section(start, subtitle_section)
+
+ if string:
+ stringset.append(string)
+
+ self.transcriber.copy_until(offset)
+ self.transcriber.add(string.template_replacement)
+ self.transcriber.skip(len(string.string))
+ else:
+ self.transcriber.copy_until(start + len(subtitle_section))
+
+ self.transcriber.copy_until(len(source))
+
+ template = self.transcriber.get_destination()
+ if not template.startswith('WEBVTT'):
+ raise ParseError("VTT file should start with 'WEBVTT'!")
+ return template, stringset
+
+ def _parse_section(self, offset, section):
+ src_strings = section.split('\n') # identifier_str is optional in VTT
+
+ timings = ""
+ timings_index = -1
+ for i in range(len(src_strings)):
+ str = src_strings[i];
+ if "-->" in str:
+ timings = str
+ timings_index = i
+
+ if timings_index < 0:
+ return None, None
+
+ # Identifier (lines preceding the line with timings) is optional in VTT.
+ # Identifier can be either numeric or textual, and it is not necessarily unique.
+ identifier = '\n'.join(src_strings[:timings_index])
+
+ # timings
+ timings_parse_error = False
+ try:
+ splitted = timings.split(None, 3)
+ if len(splitted) == 3:
+ start, arrow, end = splitted
+ else:
+ start, arrow, end, _ = splitted
+ except ValueError:
+ timings_parse_error = True
+ else:
+ if arrow != "-->":
+ timings_parse_error = True
+ if timings_parse_error:
+ raise ParseError(
+ f"Timings on line {self.transcriber.line_number + 1} "
+ "don't follow '[start] --> [end] (position)' pattern"
+ )
+ try:
+ start = self._format_timing(start)
+ except ValueError:
+ raise ParseError(
+ f"Problem with start of timing at line {self.transcriber.line_number + 1}: '{start}'"
+ )
+ try:
+ end = self._format_timing(end)
+ except ValueError:
+ raise ParseError(
+ f"Problem with end of timing at line {self.transcriber.line_number + 1}: '{end}'"
+ )
+
+ # Content
+ string_to_translate = '\n'.join(src_strings[timings_index+1:])
+ if string_to_translate == "":
+ raise ParseError(f"Subtitle is empty on line {self.transcriber.line_number + 2}")
+
+ string = OpenString(timings, string_to_translate,
+ occurrences=f"{start},{end}")
+ offset += len(identifier) + len(timings) + 1;
+ if len(identifier):
+ offset += 1
+ return offset, string
+
+ def _format_timing(self, timing):
+ try:
+ rest, milliseconds = timing.split('.')
+ milliseconds = f"{milliseconds:<03}"
+ except ValueError:
+ rest, milliseconds = timing, "000"
+ # timing may or may not contain hours part
+ if rest.count(':') == 1:
+ minutes, seconds = rest.split(':')
+ minutes, seconds, milliseconds = (int(minutes),
+ int(seconds),
+ int(milliseconds))
+ return f"{minutes:02}:{seconds:02}.{milliseconds:03}"
+ elif rest.count(':') == 2:
+ hours, minutes, seconds = rest.split(':')
+ hours, minutes, seconds, milliseconds = (int(hours),
+ int(minutes),
+ int(seconds),
+ int(milliseconds))
+ return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"
+ else:
+ raise ParseError(f"Unexpected timing format on line {self.transcriber.line_number + 2}")
+
+ def compile(self, template, stringset, **kwargs):
+ transcriber = Transcriber(template)
+ template = transcriber.source
+ stringset = iter(stringset)
+ string = next(stringset)
+
+ for start, subtitle_section in self._generate_split_subtitles(template):
+ transcriber.copy_until(start)
+ transcriber.mark_section_start()
+
+ # Find hash after timings
+ hash_position = -1
+ if subtitle_section.count('-->') > 0:
+ arrow_pos = subtitle_section.index('-->')
+ end_of_timings = subtitle_section.index('\n', arrow_pos + len('-->'))
+ hash_position = end_of_timings + 1
+
+ if hash_position < 0:
+ transcriber.copy_until(start + len(subtitle_section))
+ transcriber.mark_section_end()
+ elif (subtitle_section[
+ hash_position:
+ hash_position + len(string.template_replacement)
+ ] == string.template_replacement):
+ # found it
+ transcriber.copy_until(start + hash_position)
+ transcriber.add(string.string)
+ transcriber.skip(len(string.template_replacement))
+ transcriber.copy_until(start + len(subtitle_section))
+ transcriber.mark_section_end()
+ try:
+ string = next(stringset)
+ except StopIteration:
+ pass
+ else:
+ # did not find it, must remove section
+ transcriber.copy_until(start + len(subtitle_section))
+ transcriber.mark_section_end()
+ transcriber.remove_section()
+
+ transcriber.copy_until(len(template))
+ return transcriber.get_destination()
diff --git a/openformats/tests/formats/vtt/__init__.py b/openformats/tests/formats/vtt/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/openformats/tests/formats/vtt/files/1_el.vtt b/openformats/tests/formats/vtt/files/1_el.vtt
new file mode 100644
index 00000000..67f93a3e
--- /dev/null
+++ b/openformats/tests/formats/vtt/files/1_el.vtt
@@ -0,0 +1,35 @@
+WEBVTT
+
+STYLE here
+some long,
+long style
+
+1
+00:01:28.797 --> 00:01:30.297
+Γεια σου, Κόσμε!
+
+NOTE some note
+here
+
+2
+00:01:45.105 --> 00:01:47.940 X:350 Y:240
+Pinky: Brain, ρι θες να κάνουμε απόψε;
+Brain: Ό,τι και κάθε βράδυ, Pinky: θα κατακτήσουμε τον κόσμο!
+
+3
+00:02:45.105 --> 00:02:47.940
+el:A phrase with escaped <HTML tags>
+
+4
+00:03:45.105 --> 00:03:47.940
+el:A phrase with HTML characters
+
+5
+00:05:45.105 --> 00:05:47.940
+el:A phrase with unicode characters: ΑβΓδΕ → ♡ Ш
+
+6
+00:06:45.105 --> 00:06:47.940
+el:Three lines: First
+Second
+Third
diff --git a/openformats/tests/formats/vtt/files/1_en.vtt b/openformats/tests/formats/vtt/files/1_en.vtt
new file mode 100644
index 00000000..2201aa1a
--- /dev/null
+++ b/openformats/tests/formats/vtt/files/1_en.vtt
@@ -0,0 +1,35 @@
+WEBVTT
+
+STYLE here
+some long,
+long style
+
+1
+00:01:28.797 --> 00:01:30.297
+Hello, World!
+
+NOTE some note
+here
+
+2
+00:01:45.105 --> 00:01:47.940 X:350 Y:240
+Pinky: Gee, Brain, what do you want to do tonight?
+Brain: The same thing we do every night, Pinky - try to take over the world!
+
+3
+00:02:45.105 --> 00:02:47.940
+A phrase with escaped <HTML tags>
+
+4
+00:03:45.105 --> 00:03:47.940
+A phrase with HTML characters
+
+5
+00:05:45.105 --> 00:05:47.940
+A phrase with unicode characters: ΑβΓδΕ → ♡ Ш
+
+6
+00:06:45.105 --> 00:06:47.940
+Three lines: First
+Second
+Third
diff --git a/openformats/tests/formats/vtt/files/1_tpl.vtt b/openformats/tests/formats/vtt/files/1_tpl.vtt
new file mode 100644
index 00000000..6485f94a
--- /dev/null
+++ b/openformats/tests/formats/vtt/files/1_tpl.vtt
@@ -0,0 +1,32 @@
+WEBVTT
+
+STYLE here
+some long,
+long style
+
+1
+00:01:28.797 --> 00:01:30.297
+c386a46eaaa5ecd18e760683c3e36987_tr
+
+NOTE some note
+here
+
+2
+00:01:45.105 --> 00:01:47.940 X:350 Y:240
+f3736d657f04cedbb1eefd07e7fb4e53_tr
+
+3
+00:02:45.105 --> 00:02:47.940
+12a3c29d1c2ead6744096c2bcf5cb5a0_tr
+
+4
+00:03:45.105 --> 00:03:47.940
+32189023ec2e2af1c96ff6e50889a8e5_tr
+
+5
+00:05:45.105 --> 00:05:47.940
+df27c645bb92280c825e3e1c94a3f0b8_tr
+
+6
+00:06:45.105 --> 00:06:47.940
+22394ab09ce61d63e1f9d56ef64c4e40_tr
diff --git a/openformats/tests/formats/vtt/test_vtt.py b/openformats/tests/formats/vtt/test_vtt.py
new file mode 100644
index 00000000..f5b35ba0
--- /dev/null
+++ b/openformats/tests/formats/vtt/test_vtt.py
@@ -0,0 +1,120 @@
+import unittest
+
+from openformats.tests.formats.common import CommonFormatTestMixin
+from openformats.tests.utils import strip_leading_spaces
+from openformats.formats.vtt import VttHandler
+
+
+class VttTestCase(CommonFormatTestMixin, unittest.TestCase):
+ HANDLER_CLASS = VttHandler
+ TESTFILE_BASE = "openformats/tests/formats/vtt/files"
+
+ def test_vtt_metadata(self):
+ """vtt: Test that metadata is included in template but not included in stringset."""
+ source = strip_leading_spaces("""WEBVTT
+
+ STYLE
+ ::cue(v) {
+ color: red;
+ }
+
+ REGION
+ id:fred
+ width:40%
+
+ 1
+ 00:01:28.797 --> 00:01:30.297
+ Hello, World!
+
+ NOTE want this test to pass
+ """)
+ template, stringset = self.handler.parse(source)
+ for str in stringset:
+ s = str.string
+ self.assertFalse('WEBVTT' in s or 'STYLE' in s or 'REGION' in s or 'NOTE' in s,
+ 'Metadata should not be present in stringset!')
+ break
+ self.assertIn('WEBVTT', template)
+ self.assertIn('STYLE', template)
+ self.assertIn('REGION', template)
+ self.assertIn('NOTE', template)
+
+ source = strip_leading_spaces("""
+ 00:01:28.797 --> 00:01:30.297
+ Check the first line
+ """)
+ self._test_parse_error(source, "VTT file should start with 'WEBVTT'!")
+
+ def test_vtt_occurrences(self):
+ """vtt: Test that timings are saved as occurrences."""
+ source = strip_leading_spaces("""WEBVTT
+
+ 1
+ 00:01:28.797 --> 00:01:30.297
+ Hello, World!
+ """)
+ _, stringset = self.handler.parse(source)
+ self.assertEqual(stringset[0].occurrences, '00:01:28.797,00:01:30.297')
+
+ def test_missing_string(self):
+ source = strip_leading_spaces("""WEBVTT
+
+ 1
+ 00:01:28.797 --> 00:01:30.297
+ """)
+ self._test_parse_error(
+ source,
+ "Subtitle is empty on line 5"
+ )
+
+ def test_full_and_short_timings(self):
+ source = strip_leading_spaces("""WEBVTT
+
+ 00:01:28.797 --> 00:01:30.297
+ Full timings hh:mm:ss.fff
+
+ 01:28.797 --> 01:30.297
+ Short timings mm:ss.fff
+
+ 28.797 --> 30.297
+ Abnormal timings format ss.fff
+ """)
+ self._test_parse_error(
+ source,
+ "Unexpected timing format on line 11"
+ )
+
+ def test_wrong_timings(self):
+ source = strip_leading_spaces("""WEBVTT
+
+ 1
+ 00:01:28.797 ---> 00:01:30.297
+ Hello, World!
+ """)
+ self._test_parse_error(
+ source,
+ "Timings on line 4 don't follow '[start] --> [end] (position)' "
+ "pattern"
+ )
+
+ source = strip_leading_spaces("""WEBVTT
+
+ 1
+ 00:fas28.797 --> 00:01:30.297
+ Hello, World!
+ """)
+ self._test_parse_error(
+ source,
+ "Problem with start of timing at line 4: '00:fas28.797'"
+ )
+
+ source = strip_leading_spaces("""WEBVTT
+
+ 1
+ 00:01:28.797 --> 00:ois30.297
+ Hello, World!
+ """)
+ self._test_parse_error(
+ source,
+ "Problem with end of timing at line 4: '00:ois30.297'"
+ )
diff --git a/openformats/tests/utils/dictionary.py b/openformats/tests/utils/dictionary.py
index 263a38f8..2b93b516 100644
--- a/openformats/tests/utils/dictionary.py
+++ b/openformats/tests/utils/dictionary.py
@@ -28,7 +28,7 @@ class FunkyDictionary(object):
def __init__(self):
self.phrase_list = []
self.phrase_dict = {}
- with open(DICT_FNAME, 'rU', encoding='utf-8') as dict_file:
+ with open(DICT_FNAME, 'r', encoding='utf-8', newline=None) as dict_file:
dict_reader = csv.DictReader(dict_file)
for phrase in dict_reader:
unicode_phrase = {}
diff --git a/requirements.txt b/requirements.txt
index 89db3630..1a1e570c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,8 @@ pyparsing==2.2.0
six
lxml==4.6.5
beautifulsoup4==4.9.3
+pytest
+mock
# InDesign
git+https://github.com/kbairak/ucflib@py3_compatibility
From f3ab5070fcd9d2e7a92760644efca716cbb1c13a Mon Sep 17 00:00:00 2001
From: dsactionengine
<128065385+dsavinov-actionengine@users.noreply.github.com>
Date: Fri, 24 May 2024 16:19:48 +0200
Subject: [PATCH 2/3] Create OpenString order parameter
---
openformats/formats/vtt.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/openformats/formats/vtt.py b/openformats/formats/vtt.py
index 8e457de3..d0cc8431 100644
--- a/openformats/formats/vtt.py
+++ b/openformats/formats/vtt.py
@@ -1,3 +1,4 @@
+from itertools import count
import re
from ..handlers import Handler
@@ -26,6 +27,7 @@ def parse(self, content):
self.transcriber = Transcriber(content)
source = self.transcriber.source
stringset = []
+ self._order = count()
for start, subtitle_section in self._generate_split_subtitles(source):
self.transcriber.copy_until(start)
offset, string = self._parse_section(start, subtitle_section)
@@ -101,7 +103,8 @@ def _parse_section(self, offset, section):
raise ParseError(f"Subtitle is empty on line {self.transcriber.line_number + 2}")
string = OpenString(timings, string_to_translate,
- occurrences=f"{start},{end}")
+ occurrences=f"{start},{end}",
+ order=next(self._order))
offset += len(identifier) + len(timings) + 1;
if len(identifier):
offset += 1
From 3fdc7838b5c1f7d77ccb2f084db5d10cd0cad456 Mon Sep 17 00:00:00 2001
From: dsactionengine
<128065385+dsavinov-actionengine@users.noreply.github.com>
Date: Mon, 3 Jun 2024 13:40:05 +0200
Subject: [PATCH 3/3] fixed review comments
---
openformats/formats/vtt.py | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/openformats/formats/vtt.py b/openformats/formats/vtt.py
index d0cc8431..53fa0881 100644
--- a/openformats/formats/vtt.py
+++ b/openformats/formats/vtt.py
@@ -58,6 +58,7 @@ def _parse_section(self, offset, section):
if "-->" in str:
timings = str
timings_index = i
+ break
if timings_index < 0:
return None, None
@@ -137,7 +138,10 @@ def compile(self, template, stringset, **kwargs):
transcriber = Transcriber(template)
template = transcriber.source
stringset = iter(stringset)
- string = next(stringset)
+ try:
+ string = next(stringset)
+ except StopIteration:
+ raise ParseError("stringset cannot be empty")
for start, subtitle_section in self._generate_split_subtitles(template):
transcriber.copy_until(start)
@@ -147,8 +151,12 @@ def compile(self, template, stringset, **kwargs):
hash_position = -1
if subtitle_section.count('-->') > 0:
arrow_pos = subtitle_section.index('-->')
- end_of_timings = subtitle_section.index('\n', arrow_pos + len('-->'))
- hash_position = end_of_timings + 1
+ try:
+ end_of_timings = subtitle_section.index('\n', arrow_pos + len('-->'))
+ hash_position = end_of_timings + 1
+ except ValueError:
+ # No newlines after timing: subtitle is missing
+ pass
if hash_position < 0:
transcriber.copy_until(start + len(subtitle_section))