Skip to content

Commit

Permalink
Merge pull request #189 from cvzi/versions
Browse files Browse the repository at this point in the history
Emoji Versions
  • Loading branch information
TahirJalilov authored Oct 4, 2021
2 parents 57f010a + 250073a commit 36e2419
Show file tree
Hide file tree
Showing 17 changed files with 43,234 additions and 21,658 deletions.
9 changes: 9 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
emoji
=====

1.6.0
-----
* Fix Unicode of some emoji in the language files
* is_emoji function added
* Added a dict of dicts with emoji data, including emoji versions and statuses
* emoji.version(string) method added
* Included 'variant' in the dict of dicts


1.5.0
-----
* Emojis of English version updated to the Emoji Charts v14.0
Expand Down
7 changes: 4 additions & 3 deletions emoji/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,17 @@

__all__ = [
# emoji.core
'emojize', 'demojize', 'get_emoji_regexp', 'emoji_count', 'emoji_lis', 'replace_emoji',
'emojize', 'demojize', 'get_emoji_regexp', 'emoji_count', 'emoji_lis',
'replace_emoji', 'version',
# emoji.unicode_codes
'EMOJI_UNICODE_ENGLISH', 'EMOJI_UNICODE_SPANISH', 'EMOJI_UNICODE_PORTUGUESE',
'EMOJI_UNICODE_ITALIAN', 'EMOJI_UNICODE_FRENCH', 'EMOJI_UNICODE_GERMAN',
'UNICODE_EMOJI_ENGLISH', 'UNICODE_EMOJI_SPANISH', 'UNICODE_EMOJI_PORTUGUESE',
'UNICODE_EMOJI_ITALIAN', 'UNICODE_EMOJI_FRENCH', 'UNICODE_EMOJI_GERMAN',
'EMOJI_ALIAS_UNICODE_ENGLISH', 'UNICODE_EMOJI_ALIAS_ENGLISH',
'EMOJI_ALIAS_UNICODE_ENGLISH', 'UNICODE_EMOJI_ALIAS_ENGLISH', 'EMOJI_DATA',
]

__version__ = '1.5.0'
__version__ = '1.6.0'
__author__ = 'Taehoon Kim, Kevin Wurster and Tahir Jalilov'
__email__ = '[email protected]'
# and [email protected], [email protected]
Expand Down
178 changes: 152 additions & 26 deletions emoji/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
__all__ = [
'emojize', 'demojize', 'get_emoji_regexp',
'emoji_lis', 'distinct_emoji_lis', 'emoji_count',
'replace_emoji', 'is_emoji',
'replace_emoji', 'is_emoji', 'version',
]

PY2 = sys.version_info[0] == 2
Expand All @@ -32,6 +32,8 @@ def emojize(
delimiters=(_DEFAULT_DELIMITER, _DEFAULT_DELIMITER),
variant=None,
language='en',
version=None,
handle_version=None
):
"""Replace emoji names in a string with unicode codes.
Expand All @@ -40,6 +42,22 @@ def emojize(
:param delimiters: (optional) Use delimiters other than _DEFAULT_DELIMITER
:param variant: (optional) Choose variation selector between "base"(None), VS-15 ("text_type") and VS-16 ("emoji_type")
:param language: Choose language of emoji name
:param version: (optional) Max version. If set to an Emoji Version,
all emoji above this version will be ignored.
:param handle_version: (optional) Replace the emoji above ``version``
instead of ignoring it. handle_version can be either a string or a
callable; If it is a callable, it's passed the unicode emoji and the
data dict from emoji.EMOJI_DATA and must return a replacement string
to be used. handle_version(emj: str, data: dict) -> str
data = {
'en' : ':airplane:',
'status' : fully_qualified,
'E' : 0.6,
'de': u':flugzeug:',
'es': u':avión:',
...
}
:raises ValueError: if ``variant`` is not one of None, 'text_type' or 'emoji_type'
>>> import emoji
>>> print(emoji.emojize("Python is fun :thumbsup:", use_aliases=True))
Python is fun 👍
Expand All @@ -60,15 +78,34 @@ def replace(match):
delimiters[1], _DEFAULT_DELIMITER
)
if use_aliases:
emj = unicode_codes.EMOJI_ALIAS_UNICODE_ENGLISH.get(mg, mg)
emj = unicode_codes.EMOJI_ALIAS_UNICODE_ENGLISH.get(mg)
else:
emj = EMOJI_UNICODE.get(mg, mg)
if variant is None:
emj = EMOJI_UNICODE.get(mg)

if emj is None:
return mg

if version is not None:
if emj in unicode_codes.EMOJI_DATA and unicode_codes.EMOJI_DATA[emj]['E'] > version:
if callable(handle_version):
return handle_version(emj, unicode_codes.EMOJI_DATA[emj])
elif handle_version is not None:
return str(handle_version)
else:
return ''

if variant is None or 'variant' not in unicode_codes.EMOJI_DATA[emj]:
return emj
elif variant == "text_type":

if emj[-1] == u'\uFE0E' or emj[-1] == u'\uFE0F':
# Remove an existing variant
emj = emj[0:-1]
if variant == "text_type":
return emj + u'\uFE0E'
elif variant == "emoji_type":
return emj + u'\uFE0F'
else:
raise ValueError("Parameter 'variant' must be either None, 'text_type' or 'emoji_type'")

return pattern.sub(replace, string)

Expand All @@ -78,12 +115,29 @@ def demojize(
use_aliases=False,
delimiters=(_DEFAULT_DELIMITER, _DEFAULT_DELIMITER),
language='en',
version=None,
handle_version=None
):
"""Replace unicode emoji in a string with emoji shortcodes. Useful for storage.
:param string: String contains unicode characters. MUST BE UNICODE.
:param use_aliases: (optional) Return emoji aliases. See ``emoji.UNICODE_EMOJI_ALIAS``.
:param delimiters: (optional) Use delimiters other than _DEFAULT_DELIMITER
:param language: Choose language of emoji name
:param language: (optional) Choose language of emoji name
:param version: (optional) Max version. If set to an Emoji Version,
all emoji above this version will be removed.
:param handle_version: (optional) Replace the emoji above ``version``
instead of removing it. handle_version can be either a string or a
callable; If it is a callable, it's passed the unicode emoji and the
data dict from emoji.EMOJI_DATA and must return a replacement string
to be used. handle_version(emj: str, data: dict) -> str
data = {
'en' : ':airplane:',
'status' : fully_qualified,
'E' : 0.6,
'de': u':flugzeug:',
'es': u':avión:',
...
}
>>> import emoji
>>> print(emoji.emojize("Python is fun :thumbs_up:"))
Python is fun 👍
Expand All @@ -92,48 +146,82 @@ def demojize(
>>> print(emoji.demojize(u"Unicode is tricky 😯", delimiters=("__", "__")))
Unicode is tricky __hushed_face__
"""
UNICODE_EMOJI = unicode_codes.UNICODE_EMOJI[language]

codes_dict = unicode_codes.UNICODE_EMOJI_ALIAS_ENGLISH if use_aliases else unicode_codes.UNICODE_EMOJI[language]

def replace(match):
codes_dict = unicode_codes.UNICODE_EMOJI_ALIAS_ENGLISH if use_aliases else UNICODE_EMOJI
val = codes_dict.get(match.group(0), match.group(0))
emj = match.group(0)
val = codes_dict.get(emj)
if val is None:
return emj
if version is not None:
if emj in unicode_codes.EMOJI_DATA and unicode_codes.EMOJI_DATA[emj]['E'] > version:
if callable(handle_version):
return handle_version(emj, unicode_codes.EMOJI_DATA[emj])
elif handle_version is not None:
return str(handle_version)
else:
return ''
return delimiters[0] + val[1:-1] + delimiters[1]

return re.sub(u'\ufe0f', '', (get_emoji_regexp(language).sub(replace, string)))
return get_emoji_regexp().sub(replace, string).replace(u'\ufe0e', '').replace(u'\ufe0f', '')


def replace_emoji(string, replace='', language='en', ):
def replace_emoji(string, replace='', language=None, version=-1):
"""Replace unicode emoji with a customizable string.
:param string: String contains unicode characters. MUST BE UNICODE.
:param replace: (optional) replace can be either a string or a callable;
If it is a callable, it's passed the unicode emoji and the data dict from
emoji.EMOJI_DATA and must return a replacement string to be used.
replace(str, dict) -> str
:param version: (optional) Max version. If set to an Emoji Version,
only emoji above this version will be replaced.
:param language: (optional) Parameter is no longer used
"""
return re.sub(u'\ufe0f', '', (get_emoji_regexp(language).sub(replace, string)))

if version <= 0 and not callable(replace):
return get_emoji_regexp().sub(replace, string).replace(u'\ufe0e', '').replace(u'\ufe0f', '')

def get_emoji_regexp(language='en'):
"""Returns compiled regular expression that matches emojis defined in
``emoji.UNICODE_EMOJI_ALIAS``. The regular expression is only compiled once.
def replace_fct(match):
emj = match.group(0)

if emj in unicode_codes.EMOJI_DATA and unicode_codes.EMOJI_DATA[emj]['E'] > version:
if callable(replace):
return replace(emj, unicode_codes.EMOJI_DATA[emj])
else:
return str(replace)
return emj

return get_emoji_regexp().sub(replace_fct, string).replace(u'\ufe0e', '').replace(u'\ufe0f', '')


def get_emoji_regexp(language=None):
"""Returns compiled regular expression that matches all emojis defined in
``emoji.EMOJI_DATA``. The regular expression is only compiled once.
:param language: (optional) Parameter is no longer used
"""

global _EMOJI_REGEXP
# Build emoji regexp once
EMOJI_UNICODE = unicode_codes.EMOJI_UNICODE[language]
if _EMOJI_REGEXP is None:
# Sort emojis by length to make sure multi-character emojis are
# matched first
emojis = sorted(EMOJI_UNICODE.values(), key=len, reverse=True)
emojis = sorted(unicode_codes.EMOJI_DATA, key=len, reverse=True)
pattern = u'(' + u'|'.join(re.escape(u) for u in emojis) + u')'
_EMOJI_REGEXP = re.compile(pattern)
return _EMOJI_REGEXP


def emoji_lis(string, language='en'):
def emoji_lis(string, language=None):
"""
Returns the location and emoji in list of dict format.
>>> emoji.emoji_lis("Hi, I am fine. 😁")
>>> [{'location': 15, 'emoji': '😁'}]
:param language: (optional) Parameter is no longer used
"""
_entities = []

for match in get_emoji_regexp(language).finditer(string):
for match in get_emoji_regexp().finditer(string):
_entities.append({
'location': match.start(),
'emoji': match.group(),
Expand All @@ -142,10 +230,12 @@ def emoji_lis(string, language='en'):
return _entities


def distinct_emoji_lis(string, language='en'):
"""Returns distinct list of emojis from the string."""
def distinct_emoji_lis(string, language=None):
"""Returns distinct list of emojis from the string.
:param language: (optional) Parameter is no longer used
"""
distinct_list = list(
{e['emoji'] for e in emoji_lis(string, language)}
{e['emoji'] for e in emoji_lis(string)}
)
return distinct_list

Expand All @@ -157,7 +247,43 @@ def emoji_count(string):

def is_emoji(string):
"""Returns True if the string is an emoji"""
return string in unicode_codes.UNICODE_EMOJI['en'] or \
string in unicode_codes.UNICODE_EMOJI['es'] or \
string in unicode_codes.UNICODE_EMOJI['it'] or \
string in unicode_codes.UNICODE_EMOJI['pt']
return string in unicode_codes.EMOJI_DATA


def version(string):
    """Return the Emoji Version of the emoji.

    See http://www.unicode.org/reports/tr51/#Versioning for more information.

        >>> emoji.version("😁")
        0.6
        >>> emoji.version(":butterfly:")
        3

    The lookup is attempted in this order: ``string`` as a key of
    ``EMOJI_DATA``, ``string`` as an English name in ``EMOJI_UNICODE['en']``,
    unicode emoji found inside ``string``, alias names inside ``string``,
    and finally names inside ``string`` for every language in
    ``EMOJI_UNICODE``.

    :param string: An emoji or a text containing an emoji
    :raises ValueError: if ``string`` does not contain an emoji
    """
    emoji_data = unicode_codes.EMOJI_DATA

    # Direct hit: the whole string is a key of EMOJI_DATA.
    if string in emoji_data:
        return emoji_data[string]['E']

    # Direct hit: the whole string is an English emoji name.
    english_names = unicode_codes.EMOJI_UNICODE['en']
    if string in english_names:
        code = english_names[string]
        if code in emoji_data:
            return emoji_data[code]['E']

    # Otherwise scan the text. The callback records the version of every
    # emoji it is handed; only the first recorded entry is returned.
    found = []

    def collect(emj, data):
        found.append(data['E'])
        return ''

    # version=-1 makes the helpers hand every matched emoji to the callback.
    replace_emoji(string, replace=collect, version=-1)
    if not found:
        emojize(string, use_aliases=True, version=-1, handle_version=collect)
    if not found:
        for lang_code in unicode_codes.EMOJI_UNICODE:
            emojize(string, language=lang_code, version=-1, handle_version=collect)
            if found:
                break

    if found:
        return found[0]

    raise ValueError("No emoji found in string")
41 changes: 34 additions & 7 deletions emoji/unicode_codes/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
# -*- coding: utf-8 -*-

from emoji.unicode_codes.en import *
from emoji.unicode_codes.es import *
from emoji.unicode_codes.pt import *
from emoji.unicode_codes.it import *
from emoji.unicode_codes.fr import *
from emoji.unicode_codes.de import *
from emoji.unicode_codes.data_dict import *


__all__ = [
Expand All @@ -16,10 +11,42 @@
'EMOJI_UNICODE_PORTUGUESE', 'UNICODE_EMOJI_PORTUGUESE',
'EMOJI_UNICODE_ITALIAN', 'UNICODE_EMOJI_ITALIAN',
'EMOJI_UNICODE_FRENCH', 'UNICODE_EMOJI_FRENCH',
'EMOJI_UNICODE_GERMAN', 'UNICODE_EMOJI_GERMAN'
'EMOJI_UNICODE_GERMAN', 'UNICODE_EMOJI_GERMAN',
'EMOJI_DATA', 'STATUS'
]


def get_emoji_unicode_dict(lang):
    """Build the EMOJI_UNICODE_{language} dict (name -> unicode emoji).

    Only entries of ``EMOJI_DATA`` that have a name under ``lang`` and whose
    ``'status'`` is at most ``STATUS['fully_qualified']`` are included.

    :param lang: Key of the name inside each ``EMOJI_DATA`` entry, e.g. 'en'
    """
    name_to_emoji = {}
    for emj, data in EMOJI_DATA.items():
        if lang in data and data['status'] <= STATUS['fully_qualified']:
            # When several emoji share a name, the one iterated last wins.
            name_to_emoji[data[lang]] = emj
    return name_to_emoji


def get_unicode_emoji_dict(lang):
    """Build the UNICODE_EMOJI_{language} dict (unicode emoji -> name).

    Every entry of ``EMOJI_DATA`` that has a name under ``lang`` is included.

    :param lang: Key of the name inside each ``EMOJI_DATA`` entry, e.g. 'en'
    """
    emoji_to_name = {}
    for emj, data in EMOJI_DATA.items():
        if lang in data:
            emoji_to_name[emj] = data[lang]
    return emoji_to_name


# Per-language lookup tables, all derived from EMOJI_DATA at import time.
# EMOJI_UNICODE_* maps name -> emoji, UNICODE_EMOJI_* maps emoji -> name.
EMOJI_UNICODE_ENGLISH = get_emoji_unicode_dict('en')
UNICODE_EMOJI_ENGLISH = get_unicode_emoji_dict('en')

# Alias tables: start from the English tables and overlay the entries stored
# under the 'alias' key; on a key collision the 'alias' entry wins.
EMOJI_ALIAS_UNICODE_ENGLISH = dict(EMOJI_UNICODE_ENGLISH.items(), **get_emoji_unicode_dict('alias'))
UNICODE_EMOJI_ALIAS_ENGLISH = dict(UNICODE_EMOJI_ENGLISH.items(), **get_unicode_emoji_dict('alias'))

# German
EMOJI_UNICODE_GERMAN = get_emoji_unicode_dict('de')
UNICODE_EMOJI_GERMAN = get_unicode_emoji_dict('de')

# Spanish
EMOJI_UNICODE_SPANISH = get_emoji_unicode_dict('es')
UNICODE_EMOJI_SPANISH = get_unicode_emoji_dict('es')

# French
EMOJI_UNICODE_FRENCH = get_emoji_unicode_dict('fr')
UNICODE_EMOJI_FRENCH = get_unicode_emoji_dict('fr')

# Italian
EMOJI_UNICODE_ITALIAN = get_emoji_unicode_dict('it')
UNICODE_EMOJI_ITALIAN = get_unicode_emoji_dict('it')

# Portuguese
EMOJI_UNICODE_PORTUGUESE = get_emoji_unicode_dict('pt')
UNICODE_EMOJI_PORTUGUESE = get_unicode_emoji_dict('pt')

EMOJI_UNICODE = {
'en': EMOJI_UNICODE_ENGLISH,
'es': EMOJI_UNICODE_SPANISH,
Expand Down
Loading

0 comments on commit 36e2419

Please sign in to comment.