Improved JavaScript extraction #332

Merged · 6 commits · Mar 9, 2016
26 changes: 23 additions & 3 deletions babel/messages/extract.py
@@ -506,8 +506,12 @@ def extract_javascript(fileobj, keywords, comment_tags, options):
     :param comment_tags: a list of translator tags to search for and include
                          in the results
     :param options: a dictionary of additional options (optional)
+                    Supported options are:
+                    * `jsx` -- set to false to disable JSX/E4X support.
+                    * `template_string` -- set to false to disable ES6
+                      template string support.
     """
-    from babel.messages.jslexer import tokenize, unquote_string
+    from babel.messages.jslexer import Token, tokenize, unquote_string
     funcname = message_lineno = None
     messages = []
     last_argument = None
@@ -516,8 +520,24 @@ def extract_javascript(fileobj, keywords, comment_tags, options):
     encoding = options.get('encoding', 'utf-8')
     last_token = None
     call_stack = -1
+    dotted = any('.' in kw for kw in keywords)

-    for token in tokenize(fileobj.read().decode(encoding)):
+    for token in tokenize(
+        fileobj.read().decode(encoding),
+        jsx=options.get("jsx", True),
+        template_string=options.get("template_string", True),
+        dotted=dotted
+    ):
+        if (  # Turn keyword`foo` expressions into keyword("foo") calls:
+            funcname and  # have a keyword...
+            (last_token and last_token.type == 'name') and  # we've seen nothing after the keyword...
+            token.type == 'template_string'  # this is a template string
+        ):
+            message_lineno = token.lineno
+            messages = [unquote_string(token.value)]
+            call_stack = 0
+            token = Token('operator', ')', token.lineno)
+
         if token.type == 'operator' and token.value == '(':
             if funcname:
                 message_lineno = token.lineno
@@ -577,7 +597,7 @@ def extract_javascript(fileobj, keywords, comment_tags, options):
             messages = []
             call_stack = -1

-        elif token.type == 'string':
+        elif token.type in ('string', 'template_string'):
             new_value = unquote_string(token.value)
             if concatenate_next:
                 last_argument = (last_argument or '') + new_value
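Net effect of the extract.py changes above: a tagged template such as gettext`Hello World` is rewritten on the fly into a gettext("Hello World") call, and back-tick literals are accepted wherever plain string arguments were. A minimal sketch of how the new paths could be exercised, mirroring the existing test conventions (hypothetical snippet, not part of the diff; assumes a checkout of this branch):

    # Sketch only -- exercises the new template-string extraction paths.
    from io import BytesIO
    from babel.messages import extract

    buf = BytesIO(b"""\
    msg1 = gettext`Hello World`;
    msg2 = gettext(`Hi there`);
    """)
    messages = list(extract.extract('javascript', buf,
                                    extract.DEFAULT_KEYWORDS, [], {}))
    # Expected, given the logic above:
    #   [(1, u'Hello World', [], None), (2, u'Hi there', [], None)]
    #
    # The new options can likewise be disabled per call, e.g.:
    #   extract.extract_javascript(buf, ('gettext',), [], {'template_string': False})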
71 changes: 45 additions & 26 deletions babel/messages/jslexer.py
@@ -9,57 +9,70 @@
     :copyright: (c) 2013 by the Babel Team.
     :license: BSD, see LICENSE for more details.
 """

-from operator import itemgetter
+from collections import namedtuple
 import re
 from babel._compat import unichr

-operators = [
+operators = sorted([
     '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
     '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
     '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
     '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':'
-]
-operators.sort(key=lambda a: -len(a))
+], key=len, reverse=True)

 escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}

-rules = [
+name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
+dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_.]', re.UNICODE)
+division_re = re.compile(r'/=?')
+regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*(?s)')
+line_re = re.compile(r'(\r\n|\n|\r)')
+line_join_re = re.compile(r'\\' + line_re.pattern)
+uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
+
+Token = namedtuple('Token', 'type value lineno')
+
+_rules = [
     (None, re.compile(r'\s+(?u)')),
     (None, re.compile(r'<!--.*')),
     ('linecomment', re.compile(r'//.*')),
     ('multilinecomment', re.compile(r'/\*.*?\*/(?us)')),
-    ('name', re.compile(r'(\$+\w*|[^\W\d]\w*)(?u)')),
+    ('dotted_name', dotted_name_re),
+    ('name', name_re),
     ('number', re.compile(r'''(?x)(
         (?:0|[1-9]\d*)
         (\.\d+)?
         ([eE][-+]?\d+)? |
         (0x[a-fA-F0-9]+)
     )''')),
+    ('jsx_tag', re.compile(r'<(?:/?)\w+.+?>', re.I)),  # May be mangled in `get_rules`
     ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
+    ('template_string', re.compile(r'''`(?:[^`\\]*(?:\\.[^`\\]*)*)`''', re.UNICODE)),
     ('string', re.compile(r'''(?xs)(
         '(?:[^'\\]*(?:\\.[^'\\]*)*)' |
         "(?:[^"\\]*(?:\\.[^"\\]*)*)"
     )'''))
 ]

-division_re = re.compile(r'/=?')
-regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*(?s)')
-line_re = re.compile(r'(\r\n|\n|\r)')
-line_join_re = re.compile(r'\\' + line_re.pattern)
-uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
-
-
-class Token(tuple):
-    """Represents a token as returned by `tokenize`."""
-    __slots__ = ()
-
-    def __new__(cls, type, value, lineno):
-        return tuple.__new__(cls, (type, value, lineno))
+def get_rules(jsx, dotted, template_string):
+    """
+    Get a tokenization rule list given the passed syntax options.

-    type = property(itemgetter(0))
-    value = property(itemgetter(1))
-    lineno = property(itemgetter(2))
+    Internal to this module.
+    """
+    rules = []
+    for token_type, rule in _rules:
+        if not jsx and token_type and 'jsx' in token_type:
+            continue
+        if not template_string and token_type == 'template_string':
+            continue
+        if token_type == 'dotted_name':
+            if not dotted:
+                continue
+            token_type = 'name'
+        rules.append((token_type, rule))
+    return rules


@@ -73,9 +86,9 @@ def indicates_division(token):

 def unquote_string(string):
     """Unquote a string with JavaScript rules. The string has to start with
-    string delimiters (``'`` or ``"``.)
+    string delimiters (``'``, ``"`` or the back-tick/grave accent (for template strings).)
     """
-    assert string and string[0] == string[-1] and string[0] in '"\'', \
+    assert string and string[0] == string[-1] and string[0] in '"\'`', \
         'string provided is not properly delimited'
     string = line_join_re.sub('\\1', string[1:-1])
     result = []
@@ -127,13 +140,19 @@ def unquote_string(string):
     return u''.join(result)


-def tokenize(source):
-    """Tokenize a JavaScript source. Returns a generator of tokens.
+def tokenize(source, jsx=True, dotted=True, template_string=True):
+    """
+    Tokenize JavaScript/JSX source. Returns a generator of tokens.
+
+    :param jsx: Enable (limited) JSX parsing.
+    :param dotted: Read dotted names as single name token.
+    :param template_string: Support ES6 template strings
     """
     may_divide = False
     pos = 0
     lineno = 1
     end = len(source)
+    rules = get_rules(jsx=jsx, dotted=dotted, template_string=template_string)

     while pos < end:
         # handle regular rules first
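For reference, the reworked lexer can also be driven directly. A hypothetical usage sketch (not part of the diff), using only the tokenize(), unquote_string() and Token names introduced above:

    # Sketch only -- drives the lexer with the new syntax options.
    from babel.messages.jslexer import tokenize, unquote_string

    source = u'msg = gettext.lazy(`Hello`)'
    for token in tokenize(source, jsx=False, dotted=True, template_string=True):
        print(token.type, token.value, token.lineno)

    # With dotted=True the dotted_name rule fires before the plain name rule,
    # so 'gettext.lazy' arrives as a single 'name' token; the back-tick
    # literal arrives as one 'template_string' token, and
    # unquote_string(u'`Hello`') returns u'Hello'.

On the operators change above: sorting by length, longest first, keeps multi-character operators such as `>>>=` from being split into shorter alternations in the operator regex; `sorted(..., key=len, reverse=True)` is simply a tidier spelling of the old `operators.sort(key=lambda a: -len(a))`.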
91 changes: 0 additions & 91 deletions tests/messages/test_extract.py
@@ -410,97 +410,6 @@ def test_extract_strip_comment_tags(self):
                           u'a prefix too'], messages[1][2])


-class ExtractJavaScriptTestCase(unittest.TestCase):
-
-    def test_simple_extract(self):
-        buf = BytesIO(b"""\
-msg1 = _('simple')
-msg2 = gettext('simple')
-msg3 = ngettext('s', 'p', 42)
-""")
-        messages = \
-            list(extract.extract('javascript', buf, extract.DEFAULT_KEYWORDS,
-                                 [], {}))
-
-        self.assertEqual([(1, 'simple', [], None),
-                          (2, 'simple', [], None),
-                          (3, ('s', 'p'), [], None)], messages)
-
-    def test_various_calls(self):
-        buf = BytesIO(b"""\
-msg1 = _(i18n_arg.replace(/"/, '"'))
-msg2 = ungettext(i18n_arg.replace(/"/, '"'), multi_arg.replace(/"/, '"'), 2)
-msg3 = ungettext("Babel", multi_arg.replace(/"/, '"'), 2)
-msg4 = ungettext(i18n_arg.replace(/"/, '"'), "Babels", 2)
-msg5 = ungettext('bunny', 'bunnies', parseInt(Math.random() * 2 + 1))
-msg6 = ungettext(arg0, 'bunnies', rparseInt(Math.random() * 2 + 1))
-msg7 = _(hello.there)
-msg8 = gettext('Rabbit')
-msg9 = dgettext('wiki', model.addPage())
-msg10 = dngettext(domain, 'Page', 'Pages', 3)
-""")
-        messages = \
-            list(extract.extract('javascript', buf, extract.DEFAULT_KEYWORDS, [],
-                                 {}))
-        self.assertEqual([(5, (u'bunny', u'bunnies'), [], None),
-                          (8, u'Rabbit', [], None),
-                          (10, (u'Page', u'Pages'), [], None)], messages)
-
-    def test_message_with_line_comment(self):
-        buf = BytesIO(u"""\
-// NOTE: hello
-msg = _('Bonjour à tous')
-""".encode('utf-8'))
-        messages = list(extract.extract_javascript(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual(u'Bonjour à tous', messages[0][2])
-        self.assertEqual([u'NOTE: hello'], messages[0][3])
-
-    def test_message_with_multiline_comment(self):
-        buf = BytesIO(u"""\
-/* NOTE: hello
-   and bonjour
-     and servus */
-msg = _('Bonjour à tous')
-""".encode('utf-8'))
-        messages = list(extract.extract_javascript(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual(u'Bonjour à tous', messages[0][2])
-        self.assertEqual([u'NOTE: hello', 'and bonjour', ' and servus'], messages[0][3])
-
-    def test_ignore_function_definitions(self):
-        buf = BytesIO(b"""\
-function gettext(value) {
-    return translations[language][value] || value;
-}""")
-
-        messages = list(extract.extract_javascript(buf, ('gettext',), [], {}))
-        self.assertEqual(messages, [])
-
-    def test_misplaced_comments(self):
-        buf = BytesIO(b"""\
-/* NOTE: this won't show up */
-foo()
-
-/* NOTE: this will */
-msg = _('Something')
-
-// NOTE: this will show up
-// too.
-msg = _('Something else')
-
-// NOTE: but this won't
-bar()
-
-_('no comment here')
-""")
-        messages = list(extract.extract_javascript(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual(u'Something', messages[0][2])
-        self.assertEqual([u'NOTE: this will'], messages[0][3])
-        self.assertEqual(u'Something else', messages[1][2])
-        self.assertEqual([u'NOTE: this will show up', 'too.'], messages[1][3])
-        self.assertEqual(u'no comment here', messages[2][2])
-        self.assertEqual([], messages[2][3])
-
-
 class ExtractTestCase(unittest.TestCase):

     def test_invalid_filter(self):