Merge pull request #189 from cvzi/versions

Emoji Versions
carpedm20 · Oct 4, 2021 · 36e2419 · 36e2419
2 parents 57f010a + 250073a
commit 36e2419
Show file tree

Hide file tree

Showing 17 changed files with 43,234 additions and 21,658 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,6 +1,15 @@
 emoji
 =====
 
+1.6.0
+-----
+* Fix Unicode of some emoji in the language files
+* is_emoji function added
+* Added dict of dicts with emoji data, including emoji versions and statuses
+* emoji.version(string) method added
+* Included 'variant' in the dict of dicts
+
+
 1.5.0
 -----
 * Emojis of English version updated to the Emoji Charts v14.0

diff --git a/emoji/__init__.py b/emoji/__init__.py
@@ -20,16 +20,17 @@
 
 __all__ = [
     # emoji.core
-    'emojize', 'demojize', 'get_emoji_regexp', 'emoji_count', 'emoji_lis', 'replace_emoji',
+    'emojize', 'demojize', 'get_emoji_regexp', 'emoji_count', 'emoji_lis',
+    'replace_emoji', 'version',
     # emoji.unicode_codes
     'EMOJI_UNICODE_ENGLISH', 'EMOJI_UNICODE_SPANISH', 'EMOJI_UNICODE_PORTUGUESE',
     'EMOJI_UNICODE_ITALIAN', 'EMOJI_UNICODE_FRENCH', 'EMOJI_UNICODE_GERMAN',
     'UNICODE_EMOJI_ENGLISH', 'UNICODE_EMOJI_SPANISH', 'UNICODE_EMOJI_PORTUGUESE',
     'UNICODE_EMOJI_ITALIAN', 'UNICODE_EMOJI_FRENCH', 'UNICODE_EMOJI_GERMAN',
-    'EMOJI_ALIAS_UNICODE_ENGLISH', 'UNICODE_EMOJI_ALIAS_ENGLISH',
+    'EMOJI_ALIAS_UNICODE_ENGLISH', 'UNICODE_EMOJI_ALIAS_ENGLISH', 'EMOJI_DATA',
 ]
 
-__version__ = '1.5.0'
+__version__ = '1.6.0'
 __author__ = 'Taehoon Kim, Kevin Wurster and Tahir Jalilov'
 __email__ = '[email protected]'
 # and [email protected], [email protected]

diff --git a/emoji/core.py b/emoji/core.py
@@ -17,7 +17,7 @@
 __all__ = [
     'emojize', 'demojize', 'get_emoji_regexp',
     'emoji_lis', 'distinct_emoji_lis', 'emoji_count',
-    'replace_emoji', 'is_emoji',
+    'replace_emoji', 'is_emoji', 'version',
 ]
 
 PY2 = sys.version_info[0] == 2
@@ -32,6 +32,8 @@ def emojize(
         delimiters=(_DEFAULT_DELIMITER, _DEFAULT_DELIMITER),
         variant=None,
         language='en',
+        version=None,
+        handle_version=None
 ):
     """Replace emoji names in a string with unicode codes.
 
@@ -40,6 +42,22 @@ def emojize(
     :param delimiters: (optional) Use delimiters other than _DEFAULT_DELIMITER
     :param variant: (optional) Choose variation selector between "base"(None), VS-15 ("text_type") and VS-16 ("emoji_type")
     :param language: Choose language of emoji name
+    :param version: (optional) Max version. If set to an Emoji Version,
+        all emoji above this version will be ignored.
+    :param handle_version: (optional) Replace the emoji above ``version``
+        instead of ignoring it. handle_version can be either a string or a
+        callable; If it is a callable, it's passed the unicode emoji and the
+        data dict from emoji.EMOJI_DATA and must return a replacement string
+        to be used. handle_version(emj: str, data: dict) -> str
+        data = {
+            'en' : ':airplane:',
+            'status' : fully_qualified,
+            'E' : 0.6,
+            'de': u':flugzeug:',
+            'es': u':avión:',
+            ...
+        }
+    :raises ValueError: if ``variant`` is neither None, 'text_type' nor 'emoji_type'
         >>> import emoji
         >>> print(emoji.emojize("Python is fun :thumbsup:", use_aliases=True))
         Python is fun 👍
@@ -60,15 +78,34 @@ def replace(match):
             delimiters[1], _DEFAULT_DELIMITER
         )
         if use_aliases:
-            emj = unicode_codes.EMOJI_ALIAS_UNICODE_ENGLISH.get(mg, mg)
+            emj = unicode_codes.EMOJI_ALIAS_UNICODE_ENGLISH.get(mg)
         else:
-            emj = EMOJI_UNICODE.get(mg, mg)
-        if variant is None:
+            emj = EMOJI_UNICODE.get(mg)
+
+        if emj is None:
+            return mg
+
+        if version is not None:
+            if emj in unicode_codes.EMOJI_DATA and unicode_codes.EMOJI_DATA[emj]['E'] > version:
+                if callable(handle_version):
+                    return handle_version(emj, unicode_codes.EMOJI_DATA[emj])
+                elif handle_version is not None:
+                    return str(handle_version)
+                else:
+                    return ''
+
+        if variant is None or 'variant' not in unicode_codes.EMOJI_DATA[emj]:
             return emj
-        elif variant == "text_type":
+
+        if emj[-1] == u'\uFE0E' or emj[-1] == u'\uFE0F':
+            # Remove an existing variant
+            emj = emj[0:-1]
+        if variant == "text_type":
             return emj + u'\uFE0E'
         elif variant == "emoji_type":
             return emj + u'\uFE0F'
+        else:
+            raise ValueError("Parameter 'variant' must be either None, 'text_type' or 'emoji_type'")
 
     return pattern.sub(replace, string)
 
@@ -78,12 +115,29 @@ def demojize(
         use_aliases=False,
         delimiters=(_DEFAULT_DELIMITER, _DEFAULT_DELIMITER),
         language='en',
+        version=None,
+        handle_version=None
 ):
     """Replace unicode emoji in a string with emoji shortcodes. Useful for storage.
     :param string: String contains unicode characters. MUST BE UNICODE.
     :param use_aliases: (optional) Return emoji aliases.  See ``emoji.UNICODE_EMOJI_ALIAS``.
     :param delimiters: (optional) User delimiters other than _DEFAULT_DELIMITER
-    :param language: Choose language of emoji name
+    :param language: (optional) Choose language of emoji name
+    :param version: (optional) Max version. If set to an Emoji Version,
+        all emoji above this version will be removed.
+    :param handle_version: (optional) Replace the emoji above ``version``
+        instead of removing it. handle_version can be either a string or a
+        callable; If it is a callable, it's passed the unicode emoji and the
+        data dict from emoji.EMOJI_DATA and must return a replacement string
+        to be used. handle_version(emj: str, data: dict) -> str
+        data = {
+            'en' : ':airplane:',
+            'status' : fully_qualified,
+            'E' : 0.6,
+            'de': u':flugzeug:',
+            'es': u':avión:',
+            ...
+        }
         >>> import emoji
         >>> print(emoji.emojize("Python is fun :thumbs_up:"))
         Python is fun 👍
@@ -92,48 +146,82 @@ def demojize(
         >>> print(emoji.demojize(u"Unicode is tricky 😯", delimiters=("__", "__")))
         Unicode is tricky __hushed_face__
     """
-    UNICODE_EMOJI = unicode_codes.UNICODE_EMOJI[language]
+
+    codes_dict = unicode_codes.UNICODE_EMOJI_ALIAS_ENGLISH if use_aliases else unicode_codes.UNICODE_EMOJI[language]
 
     def replace(match):
-        codes_dict = unicode_codes.UNICODE_EMOJI_ALIAS_ENGLISH if use_aliases else UNICODE_EMOJI
-        val = codes_dict.get(match.group(0), match.group(0))
+        emj = match.group(0)
+        val = codes_dict.get(emj)
+        if val is None:
+            return emj
+        if version is not None:
+            if emj in unicode_codes.EMOJI_DATA and unicode_codes.EMOJI_DATA[emj]['E'] > version:
+                if callable(handle_version):
+                    return handle_version(emj, unicode_codes.EMOJI_DATA[emj])
+                elif handle_version is not None:
+                    return str(handle_version)
+                else:
+                    return ''
         return delimiters[0] + val[1:-1] + delimiters[1]
 
-    return re.sub(u'\ufe0f', '', (get_emoji_regexp(language).sub(replace, string)))
+    return get_emoji_regexp().sub(replace, string).replace(u'\ufe0e', '').replace(u'\ufe0f', '')
 
 
-def replace_emoji(string, replace='', language='en', ):
+def replace_emoji(string, replace='', language=None, version=-1):
     """Replace unicode emoji in a customizable string.
+    :param string: String contains unicode characters. MUST BE UNICODE.
+    :param replace: (optional) replace can be either a string or a callable;
+        If it is a callable, it's passed the unicode emoji and the data dict from
+        emoji.EMOJI_DATA and must return a replacement string to be used.
+        replace(str, dict) -> str
+    :param version: (optional) Max version. If set to an Emoji Version,
+        only emoji above this version will be replaced.
+    :param language: (optional) Parameter is no longer used
     """
-    return re.sub(u'\ufe0f', '', (get_emoji_regexp(language).sub(replace, string)))
 
+    if version <= 0 and not callable(replace):
+        return get_emoji_regexp().sub(replace, string).replace(u'\ufe0e', '').replace(u'\ufe0f', '')
 
-def get_emoji_regexp(language='en'):
-    """Returns compiled regular expression that matches emojis defined in
-    ``emoji.UNICODE_EMOJI_ALIAS``. The regular expression is only compiled once.
+    def replace_fct(match):
+        emj = match.group(0)
+
+        if emj in unicode_codes.EMOJI_DATA and unicode_codes.EMOJI_DATA[emj]['E'] > version:
+            if callable(replace):
+                return replace(emj, unicode_codes.EMOJI_DATA[emj])
+            else:
+                return str(replace)
+        return emj
+
+    return get_emoji_regexp().sub(replace_fct, string).replace(u'\ufe0e', '').replace(u'\ufe0f', '')
+
+
+def get_emoji_regexp(language=None):
+    """Returns compiled regular expression that matches all emojis defined in
+    ``emoji.EMOJI_DATA``. The regular expression is only compiled once.
+    :param language: (optional) Parameter is no longer used
     """
 
     global _EMOJI_REGEXP
     # Build emoji regexp once
-    EMOJI_UNICODE = unicode_codes.EMOJI_UNICODE[language]
     if _EMOJI_REGEXP is None:
         # Sort emojis by length to make sure multi-character emojis are
         # matched first
-        emojis = sorted(EMOJI_UNICODE.values(), key=len, reverse=True)
+        emojis = sorted(unicode_codes.EMOJI_DATA, key=len, reverse=True)
         pattern = u'(' + u'|'.join(re.escape(u) for u in emojis) + u')'
         _EMOJI_REGEXP = re.compile(pattern)
     return _EMOJI_REGEXP
 
 
-def emoji_lis(string, language='en'):
+def emoji_lis(string, language=None):
     """
     Returns the location and emoji in list of dict format.
     >>> emoji.emoji_lis("Hi, I am fine. 😁")
     >>> [{'location': 15, 'emoji': '😁'}]
+    :param language: (optional) Parameter is no longer used
     """
     _entities = []
 
-    for match in get_emoji_regexp(language).finditer(string):
+    for match in get_emoji_regexp().finditer(string):
         _entities.append({
             'location': match.start(),
             'emoji': match.group(),
@@ -142,10 +230,12 @@ def emoji_lis(string, language='en'):
     return _entities
 
 
-def distinct_emoji_lis(string, language='en'):
-    """Returns distinct list of emojis from the string."""
+def distinct_emoji_lis(string, language=None):
+    """Returns distinct list of emojis from the string.
+    :param language: (optional) Parameter is no longer used
+    """
     distinct_list = list(
-        {e['emoji'] for e in emoji_lis(string, language)}
+        {e['emoji'] for e in emoji_lis(string)}
     )
     return distinct_list
 
@@ -157,7 +247,43 @@ def emoji_count(string):
 
 def is_emoji(string):
     """Returns True if the string is an emoji"""
-    return string in unicode_codes.UNICODE_EMOJI['en'] or \
-           string in unicode_codes.UNICODE_EMOJI['es'] or \
-           string in unicode_codes.UNICODE_EMOJI['it'] or \
-           string in unicode_codes.UNICODE_EMOJI['pt']
+    return string in unicode_codes.EMOJI_DATA
+
+
+def version(string):
+    """Returns the Emoji Version of the emoji.
+    See http://www.unicode.org/reports/tr51/#Versioning for more information.
+    >>> emoji.version("😁")
+    0.6
+    >>> emoji.version(":butterfly:")
+    3
+    :param string: An emoji or a text containing an emoji
+    :raises ValueError: if ``string`` does not contain an emoji
+    """
+
+    # Try dictionary lookup
+    if string in unicode_codes.EMOJI_DATA:
+        return unicode_codes.EMOJI_DATA[string]['E']
+
+    if string in unicode_codes.EMOJI_UNICODE['en']:
+        emj_code = unicode_codes.EMOJI_UNICODE['en'][string]
+        if emj_code in unicode_codes.EMOJI_DATA:
+            return unicode_codes.EMOJI_DATA[emj_code]['E']
+
+    # Try to find first emoji in string
+    version = []
+    def f(e, emoji_data):
+        version.append(emoji_data['E'])
+        return ''
+    replace_emoji(string, replace=f, version=-1)
+    if version:
+        return version[0]
+    emojize(string, use_aliases=True, version=-1, handle_version=f)
+    if version:
+        return version[0]
+    for lang_code in unicode_codes.EMOJI_UNICODE:
+        emojize(string, language=lang_code, version=-1, handle_version=f)
+        if version:
+            return version[0]
+
+    raise ValueError("No emoji found in string")
diff --git a/emoji/unicode_codes/__init__.py b/emoji/unicode_codes/__init__.py
@@ -1,11 +1,6 @@
 # -*- coding: utf-8 -*-
 
-from emoji.unicode_codes.en import *
-from emoji.unicode_codes.es import *
-from emoji.unicode_codes.pt import *
-from emoji.unicode_codes.it import *
-from emoji.unicode_codes.fr import *
-from emoji.unicode_codes.de import *
+from emoji.unicode_codes.data_dict import *
 
 
 __all__ = [
@@ -16,10 +11,42 @@
     'EMOJI_UNICODE_PORTUGUESE', 'UNICODE_EMOJI_PORTUGUESE',
     'EMOJI_UNICODE_ITALIAN', 'UNICODE_EMOJI_ITALIAN',
     'EMOJI_UNICODE_FRENCH', 'UNICODE_EMOJI_FRENCH',
-    'EMOJI_UNICODE_GERMAN', 'UNICODE_EMOJI_GERMAN'
+    'EMOJI_UNICODE_GERMAN', 'UNICODE_EMOJI_GERMAN',
+    'EMOJI_DATA', 'STATUS'
 ]
 
 
+def get_emoji_unicode_dict(lang):
+    """ Get the EMOJI_UNICODE_{language} dict containing all fully-qualified and component emoji"""
+    return {data[lang]: emj for emj, data in EMOJI_DATA.items() if lang in data and data['status'] <= STATUS['fully_qualified']}
+
+
+def get_unicode_emoji_dict(lang):
+    """ Get the UNICODE_EMOJI_{language} dict containing all emoji that have a name in {lang}"""
+    return {emj: data[lang] for emj, data in EMOJI_DATA.items() if lang in data}
+
+
+EMOJI_UNICODE_ENGLISH = get_emoji_unicode_dict('en')
+UNICODE_EMOJI_ENGLISH = get_unicode_emoji_dict('en')
+
+EMOJI_ALIAS_UNICODE_ENGLISH = dict(EMOJI_UNICODE_ENGLISH.items(), **get_emoji_unicode_dict('alias'))
+UNICODE_EMOJI_ALIAS_ENGLISH = dict(UNICODE_EMOJI_ENGLISH.items(), **get_unicode_emoji_dict('alias'))
+
+EMOJI_UNICODE_GERMAN = get_emoji_unicode_dict('de')
+UNICODE_EMOJI_GERMAN = get_unicode_emoji_dict('de')
+
+EMOJI_UNICODE_SPANISH = get_emoji_unicode_dict('es')
+UNICODE_EMOJI_SPANISH = get_unicode_emoji_dict('es')
+
+EMOJI_UNICODE_FRENCH = get_emoji_unicode_dict('fr')
+UNICODE_EMOJI_FRENCH = get_unicode_emoji_dict('fr')
+
+EMOJI_UNICODE_ITALIAN = get_emoji_unicode_dict('it')
+UNICODE_EMOJI_ITALIAN = get_unicode_emoji_dict('it')
+
+EMOJI_UNICODE_PORTUGUESE = get_emoji_unicode_dict('pt')
+UNICODE_EMOJI_PORTUGUESE = get_unicode_emoji_dict('pt')
+
 EMOJI_UNICODE = {
     'en': EMOJI_UNICODE_ENGLISH,
     'es': EMOJI_UNICODE_SPANISH,