Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Let more encodings through and allow registration of encodings #176

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions datashape/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from .type_symbol_table import *
from .discovery import discover
from .util import *
from .internal_utils import register_encoding
from .promote import promote, optionify
from .error import DataShapeSyntaxError

Expand Down
55 changes: 15 additions & 40 deletions datashape/coretypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@

import ctypes
import operator

from math import ceil

import numpy as np

from .py2help import _inttypes, _strtypes, unicode, OrderedDict
from .internal_utils import IndexCallable, isidentifier
from .internal_utils import IndexCallable, isidentifier, canonical_name


# Classes of unit types.
Expand Down Expand Up @@ -313,31 +314,13 @@ def __str__(self):
return 'bytes'


_canonical_string_encodings = {
u'A' : u'A',
u'ascii' : u'A',
u'U8' : u'U8',
u'utf-8' : u'U8',
u'utf_8' : u'U8',
u'utf8' : u'U8',
u'U16' : u'U16',
u'utf-16' : u'U16',
u'utf_16' : u'U16',
u'utf16' : u'U16',
u'U32' : u'U32',
u'utf-32' : u'U32',
u'utf_32' : u'U32',
u'utf32' : u'U32'
}


class String(Unit):
""" String container

>>> String()
ctype("string")
>>> String(10, 'ascii')
ctype("string[10, 'A']")
ctype("string[10, 'ascii']")
"""
cls = MEASURE
__slots__ = 'fixlen', 'encoding'
Expand All @@ -353,30 +336,24 @@ def __init__(self, *args):
if len(args) == 2:
fixlen, encoding = args

encoding = encoding or 'U8'
encoding = encoding or u'utf8'
if isinstance(encoding, str):
encoding = unicode(encoding)
try:
encoding = _canonical_string_encodings[encoding]
except KeyError:
raise ValueError('Unsupported string encoding %s' %
repr(encoding))

self.encoding = encoding
self.encoding = canonical_name(encoding)
self.fixlen = fixlen

# Put it in a canonical form

def __str__(self):
if self.fixlen is None and self.encoding == 'U8':
utf8 = canonical_name('utf8')
if self.fixlen is None and self.encoding == utf8:
return 'string'
elif self.fixlen is not None and self.encoding == 'U8':
elif self.fixlen is not None and self.encoding == utf8:
return 'string[%i]' % self.fixlen
elif self.fixlen is None and self.encoding != 'U8':
return 'string[%s]' % repr(self.encoding).strip('u')
elif self.fixlen is None and self.encoding != utf8:
return 'string[%s]' % repr(self.encoding).lstrip('u')
else:
return 'string[%i, %s]' % (self.fixlen,
repr(self.encoding).strip('u'))
repr(self.encoding).lstrip('u'))

def __repr__(self):
s = str(self)
Expand All @@ -393,10 +370,8 @@ def to_numpy_dtype(self):
dtype('S30')
"""
if self.fixlen:
if self.encoding == 'A':
return np.dtype('S%d' % self.fixlen)
else:
return np.dtype('U%d' % self.fixlen)
prefix = 'S' if self.encoding == 'ascii' else 'U'
return np.dtype('%s%d' % (prefix, self.fixlen))

from .py2help import unicode
# Create a dtype with metadata indicating it's
Expand Down Expand Up @@ -685,7 +660,7 @@ def from_numpy_dtype(self, dt):
>>> CType.from_numpy_dtype(dtype('M8'))
DateTime(None)
>>> CType.from_numpy_dtype(dtype('U30'))
ctype("string[30, 'U32']")
ctype("string[30, 'utf-32']")
"""
try:
return Type.lookup_type(dt.name)
Expand Down Expand Up @@ -1139,7 +1114,7 @@ def from_numpy(shape, dt):
dshape("5 * 5 * int32")

>>> from_numpy((10,), dtype('S10'))
dshape("10 * string[10, 'A']")
dshape("10 * string[10, 'ascii']")
"""
dtype = np.dtype(dt)

Expand Down
42 changes: 42 additions & 0 deletions datashape/internal_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import keyword
import re
import codecs


class IndexCallable(object):
Expand Down Expand Up @@ -121,3 +122,44 @@ def groupby(func, seq):
def isidentifier(s):
    """Return True if `s` is a Python keyword or an ASCII identifier.

    An identifier here is a letter or underscore followed by any number of
    letters, digits or underscores.

    Parameters
    ----------
    s : str
        The candidate name.

    Returns
    -------
    bool
    """
    # `\Z` anchors at the true end of the string.  `$` would also match just
    # before a trailing newline, wrongly accepting a name such as 'abc' + LF.
    return (keyword.iskeyword(s) or
            re.match(r'^[_a-zA-Z][_a-zA-Z0-9]*\Z', s) is not None)


# Maps each user-registered encoding name to its canonical name.
_canonical_string_encodings = {}


def register_encoding(encoding, canonical_name=None):
    """Register an encoding with datashape.

    Parameters
    ----------
    encoding : str
        The name of the encoding.
    canonical_name : str, optional
        The canonical name of the encoding. Defaults to `encoding`.  When it
        names a codec Python already knows, the codec's own normalized name
        is stored (e.g. ``'utf-8'`` for ``'utf8'``) so that every spelling of
        one encoding maps to a single canonical value.

    Returns
    -------
    str
        The canonical name stored for `encoding`.

    Raises
    ------
    ValueError
        If `encoding` is already resolvable through Python's codec registry
        or has already been registered here.
    """
    try:
        # Use a separate local so the `canonical_name` argument is never
        # clobbered by the probe.
        known = codecs.lookup(encoding).name
    except LookupError:
        pass
    else:
        raise ValueError('encoding %r already registered and maps to %r' %
                         (encoding, known))
    if encoding in _canonical_string_encodings:
        raise ValueError('encoding %r already registered and maps to %r' %
                         (encoding, _canonical_string_encodings[encoding]))

    if canonical_name:
        try:
            # Normalize spellings such as 'utf8' / 'UTF-8' to the codec name.
            target = codecs.lookup(canonical_name).name
        except LookupError:
            target = canonical_name
    else:
        target = encoding

    _canonical_string_encodings[encoding] = target
    return target


def canonical_name(encoding):
    """Return the canonical name for `encoding`.

    Python's codec registry is consulted first; names it does not know are
    then looked up among the encodings registered with datashape.

    Raises
    ------
    ValueError
        If neither source knows `encoding`.
    """
    try:
        codec_info = codecs.lookup(encoding)
    except LookupError:
        if encoding in _canonical_string_encodings:
            return _canonical_string_encodings[encoding]
        raise ValueError('Invalid encoding %r. You can register the '
                         'encoding with datashape.register_encoding(%r)' %
                         (encoding, encoding))
    return codec_info.name


# 'A' is a short spelling datashape accepts for ASCII strings.  It must be
# registered explicitly: register_encoding() raises for any name Python's
# codec registry already resolves, so this call only succeeds because 'A'
# is not a codec name Python knows.
register_encoding('A', 'ascii')
2 changes: 1 addition & 1 deletion datashape/tests/test_coretypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def test_ascii_string(self):

def test_string(self):
assert (from_numpy((2,), np.dtype('U7')) ==
dshape('2 * string[7, "U32"]'))
dshape('2 * string[7, "utf32"]'))

def test_string_from_CType_classmethod(self):
assert CType.from_numpy_dtype(np.dtype('S7')) == String(7, 'A')
Expand Down
18 changes: 9 additions & 9 deletions datashape/tests/test_creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pytest

import datashape
from datashape import dshape, error, DataShape, Record
from datashape import dshape, error, DataShape, Record, register_encoding


class TestDataShapeCreation(unittest.TestCase):
Expand Down Expand Up @@ -70,14 +70,14 @@ def test_type_decl_concrete(self):

def test_string_atom(self):
self.assertEqual(dshape('string'), dshape("string['U8']"))
self.assertEqual(dshape("string['ascii']")[0].encoding, 'A')
self.assertEqual(dshape("string['A']")[0].encoding, 'A')
self.assertEqual(dshape("string['utf-8']")[0].encoding, 'U8')
self.assertEqual(dshape("string['U8']")[0].encoding, 'U8')
self.assertEqual(dshape("string['utf-16']")[0].encoding, 'U16')
self.assertEqual(dshape("string['U16']")[0].encoding, 'U16')
self.assertEqual(dshape("string['utf-32']")[0].encoding, 'U32')
self.assertEqual(dshape("string['U32']")[0].encoding, 'U32')
self.assertEqual(dshape("string['ascii']")[0].encoding, 'ascii')
self.assertEqual(dshape("string['A']")[0].encoding, 'ascii')
self.assertEqual(dshape("string['utf-8']")[0].encoding, 'utf-8')
self.assertEqual(dshape("string['U8']")[0].encoding, 'utf-8')
self.assertEqual(dshape("string['utf-16']")[0].encoding, 'utf-16')
self.assertEqual(dshape("string['U16']")[0].encoding, 'utf-16')
self.assertEqual(dshape("string['utf-32']")[0].encoding, 'utf-32')
self.assertEqual(dshape("string['U32']")[0].encoding, 'utf-32')

def test_time(self):
self.assertEqual(dshape('time')[0].tz, None)
Expand Down
4 changes: 2 additions & 2 deletions datashape/tests/test_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def test_primitive_measure_str(self):
self.assertEqual(str(datashape.float64), 'float64')
self.assertEqual(str(datashape.string), 'string')
self.assertEqual(str(datashape.String(3)), 'string[3]')
self.assertEqual(str(datashape.String('A')), "string['A']")
self.assertEqual(str(datashape.String('A')), "string['ascii']")

def test_structure_str(self):
self.assertEqual(str(dshape('{x:int32, y:int64}')),
Expand All @@ -43,7 +43,7 @@ def test_primitive_measure_repr(self):
self.assertEqual(repr(datashape.string), 'ctype("string")')
self.assertEqual(repr(datashape.String(3)), 'ctype("string[3]")')
self.assertEqual(repr(datashape.String('A')),
"""ctype("string['A']")""")
"""ctype("string['ascii']")""")

def test_structure_repr(self):
self.assertEqual(repr(dshape('{x:int32, y:int64}')),
Expand Down
12 changes: 12 additions & 0 deletions datashape/tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import datashape
from datashape import dshape, has_var_dim, has_ellipsis
from datashape import register_encoding


def test_cat_dshapes():
Expand Down Expand Up @@ -70,3 +71,14 @@ def test_has_ellipsis(ds):
(dshape("M * int32"),)])
def test_not_has_ellipsis(ds):
assert not has_ellipsis(ds)


def test_register_codec():
    """An unknown encoding is rejected until registered, and only once."""
    name = 'utf8mb4'
    spec = "string['utf8mb4']"

    # Before registration the datashape string cannot be parsed.
    with pytest.raises(ValueError):
        assert dshape(spec).measure.encoding == name

    # After registration the name round-trips unchanged.
    register_encoding(name)
    assert dshape(spec).measure.encoding == name

    # Registering the same name a second time is an error.
    with pytest.raises(ValueError):
        register_encoding(name)
2 changes: 0 additions & 2 deletions datashape/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
from __future__ import print_function, division, absolute_import

import operator
import ctypes
import sys

from . import py2help
from . import parser
Expand Down