From 4df7e668009199eb987168a1f757cb3a6f1ee6ae Mon Sep 17 00:00:00 2001 From: "Dylan Kiss (dyki)" Date: Thu, 10 Oct 2024 10:46:07 +0200 Subject: [PATCH] Allow extracting nested calls in Javascript Currently the Javascript extractor does not support nested gettext calls at all. The extraction code was refactored a bit to resemble the Python code as much as possible and support this use-case. --- babel/messages/extract.py | 208 +++++++++++++++++++----------- tests/messages/test_js_extract.py | 25 ++++ 2 files changed, 156 insertions(+), 77 deletions(-) diff --git a/babel/messages/extract.py b/babel/messages/extract.py index c46c719db..6dc458c1c 100644 --- a/babel/messages/extract.py +++ b/babel/messages/extract.py @@ -706,54 +706,109 @@ def extract_javascript( :param lineno: line number offset (for parsing embedded fragments) """ from babel.messages.jslexer import Token, tokenize, unquote_string - funcname = message_lineno = None - messages = [] - last_argument = None - translator_comments = [] - concatenate_next = False + encoding = options.get('encoding', 'utf-8') - last_token = None - call_stack = -1 dotted = any('.' in kw for kw in keywords) + last_token = None + # Keep the stack of all function calls and its related contextual variables, + # so we can handle nested gettext calls. + function_stack = [] + # Keep track of whether we're in a class or function definition + in_def = False + # Keep track of whether we're in a block of translator comments + in_translator_comments = False + # Keep track of the last encountered translator comments + translator_comments = [] + # Keep track of the (split) strings encountered + message_buffer = [] + for token in tokenize( fileobj.read().decode(encoding), - jsx=options.get("jsx", True), - template_string=options.get("template_string", True), + jsx=options.get('jsx', True), + template_string=options.get('template_string', True), dotted=dotted, lineno=lineno, ): - if ( # Turn keyword`foo` expressions into keyword("foo") calls: - funcname and # have a keyword... - (last_token and last_token.type == 'name') and # we've seen nothing after the keyword... - token.type == 'template_string' # this is a template string + if token.type == 'name' and token.value in ('class', 'function'): + # We're entering a class or function definition + in_def = True + + elif in_def and token.type == 'operator' and token.value in ('(', '{'): + # We're in a class or function definition and should not do anything + in_def = False + continue + + elif ( + last_token + and last_token.type == 'name' + and last_token.value in keywords + and token.type == 'template_string' ): - message_lineno = token.lineno - messages = [unquote_string(token.value)] - call_stack = 0 + # Turn keyword`foo` expressions into keyword("foo") function calls + string_value = unquote_string(token.value) + cur_translator_comments = translator_comments + if function_stack and function_stack[-1]['function_line_no'] == last_token.lineno: + # If our current function call is on the same line as the previous one, + # copy their translator comments, since they also apply to us. + cur_translator_comments = function_stack[-1]['translator_comments'] + + # We add all information needed later for the current function call + function_stack.append({ + 'function_line_no': last_token.lineno, + 'function_name': last_token.value, + 'message_line_no': token.lineno, + 'messages': [string_value], + 'translator_comments': cur_translator_comments, + }) + translator_comments = [] + + # We act as if we are closing the function call now token = Token('operator', ')', token.lineno) - if options.get('parse_template_string') and not funcname and token.type == 'template_string': + if ( + options.get('parse_template_string') + and (not last_token or last_token.type != 'name' or last_token.value not in keywords) + and token.type == 'template_string' + ): yield from parse_template_string(token.value, keywords, comment_tags, options, token.lineno) elif token.type == 'operator' and token.value == '(': - if funcname: - message_lineno = token.lineno - call_stack += 1 + if last_token.type == 'name': + # We're entering a function call + cur_translator_comments = translator_comments + if function_stack and function_stack[-1]['function_line_no'] == token.lineno: + # If our current function call is on the same line as the previous one, + # copy their translator comments, since they also apply to us. + cur_translator_comments = function_stack[-1]['translator_comments'] + + # We add all information needed later for the current function call + function_stack.append({ + 'function_line_no': token.lineno, + 'function_name': last_token.value, + 'message_line_no': None, + 'messages': [], + 'translator_comments': cur_translator_comments, + }) + translator_comments = [] - elif call_stack == -1 and token.type == 'linecomment': + elif token.type == 'linecomment': + # Strip the comment token from the line value = token.value[2:].strip() - if translator_comments and \ - translator_comments[-1][0] == token.lineno - 1: + if in_translator_comments and translator_comments[-1][0] == token.lineno - 1: + # We're already inside a translator comment, continue appending translator_comments.append((token.lineno, value)) continue for comment_tag in comment_tags: if value.startswith(comment_tag): - translator_comments.append((token.lineno, value.strip())) + # Comment starts with one of the comment tags, + # so let's start capturing it + in_translator_comments = True + translator_comments.append((token.lineno, value)) break elif token.type == 'multilinecomment': - # only one multi-line comment may precede a translation + # Only one multi-line comment may precede a translation translator_comments = [] value = token.value[2:-2].strip() for comment_tag in comment_tags: @@ -763,68 +818,67 @@ def extract_javascript( lines[0] = lines[0].strip() lines[1:] = dedent('\n'.join(lines[1:])).splitlines() for offset, line in enumerate(lines): - translator_comments.append((token.lineno + offset, - line)) + translator_comments.append((token.lineno + offset, line)) break - elif funcname and call_stack == 0: + elif function_stack and function_stack[-1]['function_name'] in keywords: + # We're inside a translation function call if token.type == 'operator' and token.value == ')': - if last_argument is not None: - messages.append(last_argument) - if len(messages) > 1: - messages = tuple(messages) - elif messages: - messages = messages[0] + # The call has ended, so we yield the translatable term(s) + messages = function_stack[-1]['messages'] + line_no = ( + function_stack[-1]['message_line_no'] + or function_stack[-1]['function_line_no'] + ) + cur_translator_comments = function_stack[-1]['translator_comments'] + + if message_buffer: + messages.append(''.join(message_buffer)) + message_buffer.clear() else: - messages = None + messages.append(None) - # Comments don't apply unless they immediately precede the - # message - if translator_comments and \ - translator_comments[-1][0] < message_lineno - 1: - translator_comments = [] + messages = tuple(messages) if len(messages) > 1 else messages[0] + if ( + cur_translator_comments + and cur_translator_comments[-1][0] < line_no - 1 + ): + # The translator comments are not immediately preceding the current + # term, so we skip them. + cur_translator_comments = [] - if messages is not None: - yield (message_lineno, funcname, messages, - [comment[1] for comment in translator_comments]) + yield ( + line_no, + function_stack[-1]['function_name'], + messages, + [comment[1] for comment in cur_translator_comments], + ) - funcname = message_lineno = last_argument = None - concatenate_next = False - translator_comments = [] - messages = [] - call_stack = -1 + function_stack.pop() elif token.type in ('string', 'template_string'): - new_value = unquote_string(token.value) - if concatenate_next: - last_argument = (last_argument or '') + new_value - concatenate_next = False + # We've encountered a string inside a translation function call + string_value = unquote_string(token.value) + if not function_stack[-1]['message_line_no']: + function_stack[-1]['message_line_no'] = token.lineno + if string_value is not None: + message_buffer.append(string_value) + + elif token.type == 'operator' and token.value == ',': + # End of a function call argument + if message_buffer: + function_stack[-1]['messages'].append(''.join(message_buffer)) + message_buffer.clear() else: - last_argument = new_value - - elif token.type == 'operator': - if token.value == ',': - if last_argument is not None: - messages.append(last_argument) - last_argument = None - else: - messages.append(None) - concatenate_next = False - elif token.value == '+': - concatenate_next = True - - elif call_stack > 0 and token.type == 'operator' \ - and token.value == ')': - call_stack -= 1 - - elif funcname and call_stack == -1: - funcname = None - - elif call_stack == -1 and token.type == 'name' and \ - token.value in keywords and \ - (last_token is None or last_token.type != 'name' or - last_token.value != 'function'): - funcname = token.value + function_stack[-1]['messages'].append(None) + + elif function_stack and token.type == 'operator' and token.value == ')': + function_stack.pop() + + if in_translator_comments and translator_comments[-1][0] < token.lineno: + # We have a newline in between the comments, so they don't belong + # together anymore + in_translator_comments = False last_token = token diff --git a/tests/messages/test_js_extract.py b/tests/messages/test_js_extract.py index fc643851e..6b11d7618 100644 --- a/tests/messages/test_js_extract.py +++ b/tests/messages/test_js_extract.py @@ -191,3 +191,28 @@ def test_inside_nested_template_string(): ) assert messages == [(1, 'Greetings!', [], None), (1, 'This is a lovely evening.', [], None), (1, 'The day is really nice!', [], None)] + +def test_nested_gettext_calls(): + buf = BytesIO(b"""\ +gettext("Hello %s", gettext("User")); +gettext("Hello %(user)s", { user: gettext("User") }); +gettext("Hello %s", dummy.dummyFunction(gettext("User"))); +gettext( + "Hello %(user)s", + { user: dummy.dummyFunction(gettext("User")) }, +); +""") + messages = list( + extract.extract('javascript', buf, {"gettext": None}, [], {}), + ) + + assert messages == [ + (1, 'User', [], None), + (1, 'Hello %s', [], None), + (2, 'User', [], None), + (2, 'Hello %(user)s', [], None), + (3, 'User', [], None), + (3, 'Hello %s', [], None), + (6, 'User', [], None), + (5, 'Hello %(user)s', [], None), + ]