From d425f86a08d5f459d7380d7c196ecb33af564f5c Mon Sep 17 00:00:00 2001 From: Johannes Wilm Date: Fri, 6 Jan 2023 21:18:35 +0100 Subject: [PATCH] Improved javascript template string expression extracting (#939) Co-authored-by: Rik Co-authored-by: Aarni Koskela --- babel/messages/extract.py | 59 +++++++++++++++++++++++++++---- babel/messages/jslexer.py | 4 +-- tests/messages/test_js_extract.py | 39 ++++++++++++++++++++ 3 files changed, 94 insertions(+), 8 deletions(-) diff --git a/babel/messages/extract.py b/babel/messages/extract.py index 4f0f649b3..c19dd5af2 100644 --- a/babel/messages/extract.py +++ b/babel/messages/extract.py @@ -16,9 +16,10 @@ :license: BSD, see LICENSE for more details. """ import ast +import io import os -from os.path import relpath import sys +from os.path import relpath from tokenize import generate_tokens, COMMENT, NAME, OP, STRING from babel.util import parse_encoding, parse_future_flags, pathmatch @@ -532,7 +533,7 @@ def _parse_python_string(value, encoding, future_flags): return None -def extract_javascript(fileobj, keywords, comment_tags, options): +def extract_javascript(fileobj, keywords, comment_tags, options, lineno=1): """Extract messages from JavaScript source code. :param fileobj: the seekable, file-like object the messages should be @@ -544,7 +545,11 @@ def extract_javascript(fileobj, keywords, comment_tags, options): :param options: a dictionary of additional options (optional) Supported options are: * `jsx` -- set to false to disable JSX/E4X support. - * `template_string` -- set to false to disable ES6 template string support. + * `template_string` -- if `True`, supports gettext(`key`) + * `parse_template_string` -- if `True` will parse the + contents of javascript + template strings. + :param lineno: line number offset (for parsing embedded fragments) """ from babel.messages.jslexer import Token, tokenize, unquote_string funcname = message_lineno = None @@ -556,12 +561,12 @@ def extract_javascript(fileobj, keywords, comment_tags, options): last_token = None call_stack = -1 dotted = any('.' in kw for kw in keywords) - for token in tokenize( fileobj.read().decode(encoding), jsx=options.get("jsx", True), template_string=options.get("template_string", True), - dotted=dotted + dotted=dotted, + lineno=lineno ): if ( # Turn keyword`foo` expressions into keyword("foo") calls: funcname and # have a keyword... @@ -573,7 +578,11 @@ def extract_javascript(fileobj, keywords, comment_tags, options): call_stack = 0 token = Token('operator', ')', token.lineno) - if token.type == 'operator' and token.value == '(': + if options.get('parse_template_string') and not funcname and token.type == 'template_string': + for item in parse_template_string(token.value, keywords, comment_tags, options, token.lineno): + yield item + + elif token.type == 'operator' and token.value == '(': if funcname: message_lineno = token.lineno call_stack += 1 @@ -665,3 +674,41 @@ def extract_javascript(fileobj, keywords, comment_tags, options): funcname = token.value last_token = token + + +def parse_template_string(template_string, keywords, comment_tags, options, lineno=1): + """Parse JavaScript template string. + + :param template_string: the template string to be parsed + :param keywords: a list of keywords (i.e. function names) that should be + recognized as translation functions + :param comment_tags: a list of translator tags to search for and include + in the results + :param options: a dictionary of additional options (optional) + :param lineno: starting line number (optional) + """ + from babel.messages.jslexer import line_re + prev_character = None + level = 0 + inside_str = False + expression_contents = '' + for character in template_string[1:-1]: + if not inside_str and character in ('"', "'", '`'): + inside_str = character + elif inside_str == character and prev_character != r'\\': + inside_str = False + if level: + expression_contents += character + if not inside_str: + if character == '{' and prev_character == '$': + level += 1 + elif level and character == '}': + level -= 1 + if level == 0 and expression_contents: + expression_contents = expression_contents[0:-1] + fake_file_obj = io.BytesIO(expression_contents.encode()) + for item in extract_javascript(fake_file_obj, keywords, comment_tags, options, lineno): + yield item + lineno += len(line_re.findall(expression_contents)) + expression_contents = '' + prev_character = character diff --git a/babel/messages/jslexer.py b/babel/messages/jslexer.py index 1264b2dbc..886f69d20 100644 --- a/babel/messages/jslexer.py +++ b/babel/messages/jslexer.py @@ -151,17 +151,17 @@ def unquote_string(string): return u''.join(result) -def tokenize(source, jsx=True, dotted=True, template_string=True): +def tokenize(source, jsx=True, dotted=True, template_string=True, lineno=1): """ Tokenize JavaScript/JSX source. Returns a generator of tokens. :param jsx: Enable (limited) JSX parsing. :param dotted: Read dotted names as single name token. :param template_string: Support ES6 template strings + :param lineno: starting line number (optional) """ may_divide = False pos = 0 - lineno = 1 end = len(source) rules = get_rules(jsx=jsx, dotted=dotted, template_string=template_string) diff --git a/tests/messages/test_js_extract.py b/tests/messages/test_js_extract.py index 72c521144..95985c0f7 100644 --- a/tests/messages/test_js_extract.py +++ b/tests/messages/test_js_extract.py @@ -150,3 +150,42 @@ def test_template_string_tag_usage(): ) assert messages == [(1, 'Tag template, wow', [], None)] + + +def test_inside_template_string(): + buf = BytesIO(b"const msg = `${gettext('Hello')} ${user.name}`") + messages = list( + extract.extract('javascript', buf, {"gettext": None}, [], {'parse_template_string': True}) + ) + + assert messages == [(1, 'Hello', [], None)] + + +def test_inside_template_string_with_linebreaks(): + buf = BytesIO(b"""\ +const userName = gettext('Username') +const msg = `${ +gettext('Hello') +} ${userName} ${ +gettext('Are you having a nice day?') +}` +const msg2 = `${ +gettext('Howdy') +} ${userName} ${ +gettext('Are you doing ok?') +}` +""") + messages = list( + extract.extract('javascript', buf, {"gettext": None}, [], {'parse_template_string': True}) + ) + + assert messages == [(1, 'Username', [], None), (3, 'Hello', [], None), (5, 'Are you having a nice day?', [], None), (8, 'Howdy', [], None), (10, 'Are you doing ok?', [], None)] + + +def test_inside_nested_template_string(): + buf = BytesIO(b"const msg = `${gettext('Greetings!')} ${ evening ? `${user.name}: ${gettext('This is a lovely evening.')}` : `${gettext('The day is really nice!')} ${user.name}`}`") + messages = list( + extract.extract('javascript', buf, {"gettext": None}, [], {'parse_template_string': True}) + ) + + assert messages == [(1, 'Greetings!', [], None), (1, 'This is a lovely evening.', [], None), (1, 'The day is really nice!', [], None)]