From 94ec1fc272295b3b7f73cb178703a0f67c7fd5d0 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Wed, 6 Dec 2023 12:10:08 +0000 Subject: [PATCH] Rename corpus function names for consistency - Add `get_corpus_as_is()` which is the former `get_corpus(as_is=True)` to avoid mixing the return type - Fix return type of `provinces()` - Add type hints - Add license info --- docs/api/corpus.rst | 9 +- pythainlp/corpus/__init__.py | 12 +- pythainlp/corpus/common.py | 83 ++++++++------ pythainlp/corpus/core.py | 105 ++++++++++-------- pythainlp/corpus/icu.py | 2 +- ...{thai_orst_words.txt => orst_words_th.txt} | 0 pythainlp/corpus/volubilis.py | 28 ++--- ...is_modified.txt => volubilis_words_th.txt} | 2 + .../{wikipedia_titles.py => wikipedia.py} | 11 +- ...dia_titles.txt => wikipedia_titles_th.txt} | 2 + 10 files changed, 148 insertions(+), 106 deletions(-) rename pythainlp/corpus/{thai_orst_words.txt => orst_words_th.txt} (100%) rename pythainlp/corpus/{volubilis_modified.txt => volubilis_words_th.txt} (99%) rename pythainlp/corpus/{wikipedia_titles.py => wikipedia.py} (71%) rename pythainlp/corpus/{wikipedia_titles.txt => wikipedia_titles_th.txt} (99%) diff --git a/docs/api/corpus.rst b/docs/api/corpus.rst index fbd25822d..ddb81e8f6 100644 --- a/docs/api/corpus.rst +++ b/docs/api/corpus.rst @@ -17,6 +17,11 @@ get_corpus .. autofunction:: get_corpus :noindex: +get_corpus_as_is +~~~~~~~~~~ +.. autofunction:: get_corpus_as_is + :noindex: + get_corpus_db ~~~~~~~~~~~~~~ .. autofunction:: get_corpus_db @@ -77,9 +82,9 @@ thai_orst_words .. autofunction:: thai_orst_words :noindex: -thai_synonym +thai_synonyms ~~~~~~~~~~~~~~ -.. 
autofunction:: thai_synonyms :noindex: thai_syllables diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 7a087f603..7fff98d85 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -15,6 +15,7 @@ "countries", "download", "get_corpus", + "get_corpus_as_is", "get_corpus_db", "get_corpus_db_detail", "get_corpus_default_db", @@ -33,10 +34,11 @@ "thai_stopwords", "thai_syllables", "thai_synonym", + "thai_synonyms", + "thai_volubilis_words", + "thai_wikipedia_titles", "thai_words", "thai_wsd_dict", - "volubilis", - "wikipedia_titles", ] import os @@ -88,6 +90,7 @@ def corpus_db_path() -> str: from pythainlp.corpus.core import ( download, get_corpus, + get_corpus_as_is, get_corpus_db, get_corpus_db_detail, get_corpus_default_db, @@ -108,9 +111,10 @@ def corpus_db_path() -> str: thai_stopwords, thai_syllables, thai_synonym, + thai_synonyms, thai_words, thai_wsd_dict, ) from pythainlp.corpus.icu import thai_icu_words -from pythainlp.corpus.volubilis import volubilis -from pythainlp.corpus.wikipedia_titles import wikipedia_titles +from pythainlp.corpus.volubilis import thai_volubilis_words +from pythainlp.corpus.wikipedia import thai_wikipedia_titles diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index 35b4c8ed7..36a8c718c 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -12,49 +12,51 @@ "thai_female_names", "thai_male_names", "thai_negations", + "thai_dict", "thai_stopwords", "thai_syllables", + "thai_synonym", + "thai_synonyms", "thai_words", - "thai_dict", "thai_wsd_dict", - "thai_synonym", ] from typing import FrozenSet, List, Union +import warnings -from pythainlp.corpus import get_corpus, get_corpus_path +from pythainlp.corpus import get_corpus, get_corpus_as_is, get_corpus_path -_THAI_COUNTRIES = set() +_THAI_COUNTRIES: FrozenSet[str] = frozenset() _THAI_COUNTRIES_FILENAME = "countries_th.txt" -_THAI_THAILAND_PROVINCES = set() -_THAI_THAILAND_PROVINCES_DETAILS = [] 
+_THAI_THAILAND_PROVINCES: FrozenSet[str] = frozenset() +_THAI_THAILAND_PROVINCES_DETAILS: List[dict] = [] _THAI_THAILAND_PROVINCES_FILENAME = "thailand_provinces_th.csv" -_THAI_SYLLABLES = set() +_THAI_SYLLABLES: FrozenSet[str] = frozenset() _THAI_SYLLABLES_FILENAME = "syllables_th.txt" -_THAI_WORDS = set() +_THAI_WORDS: FrozenSet[str] = frozenset() _THAI_WORDS_FILENAME = "words_th.txt" -_THAI_STOPWORDS = set() +_THAI_STOPWORDS: FrozenSet[str] = frozenset() _THAI_STOPWORDS_FILENAME = "stopwords_th.txt" -_THAI_NEGATIONS = set() +_THAI_NEGATIONS: FrozenSet[str] = frozenset() _THAI_NEGATIONS_FILENAME = "negations_th.txt" -_THAI_FAMLIY_NAMES = set() +_THAI_FAMLIY_NAMES: FrozenSet[str] = frozenset() _THAI_FAMLIY_NAMES_FILENAME = "family_names_th.txt" -_THAI_FEMALE_NAMES = set() +_THAI_FEMALE_NAMES: FrozenSet[str] = frozenset() _THAI_FEMALE_NAMES_FILENAME = "person_names_female_th.txt" -_THAI_MALE_NAMES = set() +_THAI_MALE_NAMES: FrozenSet[str] = frozenset() _THAI_MALE_NAMES_FILENAME = "person_names_male_th.txt" -_THAI_ORST_WORDS = set() +_THAI_ORST_WORDS: FrozenSet[str] = frozenset() _THAI_DICT = {} _THAI_WSD_DICT = {} -_THAI_SYNONYM = None +_THAI_SYNONYMS = {} def countries() -> FrozenSet[str]: @@ -74,7 +76,7 @@ def countries() -> FrozenSet[str]: return _THAI_COUNTRIES -def provinces(details: bool = False) -> Union[FrozenSet[str], List[str]]: +def provinces(details: bool = False) -> Union[FrozenSet[str], List[dict]]: """ Return a frozenset of Thailand province names in Thai such as "กระบี่", "กรุงเทพมหานคร", "กาญจนบุรี", and "อุบลราชธานี". 
@@ -96,7 +98,7 @@ def provinces(details: bool = False) -> Union[FrozenSet[str], List[str]]: provs = set() prov_details = [] - for line in get_corpus(_THAI_THAILAND_PROVINCES_FILENAME, as_is=True): + for line in get_corpus_as_is(_THAI_THAILAND_PROVINCES_FILENAME): p = line.split(",") prov = {} @@ -155,14 +157,14 @@ def thai_orst_words() -> FrozenSet[str]: """ Return a frozenset of Thai words from Royal Society of Thailand \n(See: `dev/pythainlp/corpus/thai_orst_words.txt\ - `_) + `_) :return: :class:`frozenset` containing words in the Thai language. :rtype: :class:`frozenset` """ global _THAI_ORST_WORDS if not _THAI_ORST_WORDS: - _THAI_ORST_WORDS = get_corpus("thai_orst_words.txt") + _THAI_ORST_WORDS = get_corpus("orst_words_th.txt") return _THAI_ORST_WORDS @@ -266,8 +268,11 @@ def thai_dict() -> dict: global _THAI_DICT if not _THAI_DICT: import csv - _THAI_DICT = {"word":[], "meaning":[]} - with open(get_corpus_path("thai_dict"), newline="\n", encoding="utf-8") as csvfile: + + _THAI_DICT = {"word": [], "meaning": []} + with open( + get_corpus_path("thai_dict"), newline="\n", encoding="utf-8" + ) as csvfile: reader = csv.DictReader(csvfile, delimiter=",") for row in reader: _THAI_DICT["word"].append(row["word"]) @@ -288,38 +293,46 @@ def thai_wsd_dict() -> dict: global _THAI_WSD_DICT if not _THAI_WSD_DICT: _thai_wsd = thai_dict() - _THAI_WSD_DICT = {"word":[],"meaning":[]} - for i,j in zip(_thai_wsd["word"],_thai_wsd["meaning"]): + _THAI_WSD_DICT = {"word": [], "meaning": []} + for i, j in zip(_thai_wsd["word"], _thai_wsd["meaning"]): _all_value = list(eval(j).values()) _use = [] for k in _all_value: _use.extend(k) - _use=list(set(_use)) - if len(_use)>1: + _use = list(set(_use)) + if len(_use) > 1: _THAI_WSD_DICT["word"].append(i) _THAI_WSD_DICT["meaning"].append(_use) return _THAI_WSD_DICT -def thai_synonym() -> dict: +def thai_synonyms() -> dict: """ - Return Thai synonym. + Return Thai synonyms. 
\n(See: `thai_synonym\ `_) :return: Thai words with part-of-speech type and synonym :rtype: dict """ - global _THAI_SYNONYM - if _THAI_SYNONYM is None: + global _THAI_SYNONYMS + if not _THAI_SYNONYMS: import csv - _THAI_SYNONYM = {"word":[], "pos":[], "synonym":[]} - with open(get_corpus_path("thai_synonym"), newline="\n", encoding="utf-8") as csvfile: + + _THAI_SYNONYMS = {"word": [], "pos": [], "synonym": []} + with open( + get_corpus_path("thai_synonym"), newline="\n", encoding="utf-8" + ) as csvfile: reader = csv.DictReader(csvfile, delimiter=",") for row in reader: - _THAI_SYNONYM["word"].append(row["word"]) - _THAI_SYNONYM["pos"].append(row["pos"]) - _THAI_SYNONYM["synonym"].append(row["synonym"].split("|")) + _THAI_SYNONYMS["word"].append(row["word"]) + _THAI_SYNONYMS["pos"].append(row["pos"]) + _THAI_SYNONYMS["synonym"].append(row["synonym"].split("|")) + + return _THAI_SYNONYMS - return _THAI_SYNONYM + +def thai_synonym() -> dict: + warnings.warn("Deprecated: Use thai_synonyms() instead.", DeprecationWarning) + return thai_synonyms() diff --git a/pythainlp/corpus/core.py b/pythainlp/corpus/core.py index 50b4004e6..9e73106dc 100644 --- a/pythainlp/corpus/core.py +++ b/pythainlp/corpus/core.py @@ -35,7 +35,7 @@ def get_corpus_db(url: str): return corpus_db -def get_corpus_db_detail(name: str, version: str = None) -> dict: +def get_corpus_db_detail(name: str, version: str = '') -> dict: """ Get details about a corpus, using information from local catalog. 
@@ -46,7 +46,7 @@ def get_corpus_db_detail(name: str, version: str = None) -> dict: with open(corpus_db_path(), "r", encoding="utf-8-sig") as f: local_db = json.load(f) - if version is None: + if not version: for corpus in local_db["_default"].values(): if corpus["name"] == name: return corpus @@ -70,30 +70,22 @@ def path_pythainlp_corpus(filename: str) -> str: return os.path.join(corpus_path(), filename) -def get_corpus(filename: str, - as_is: bool = False, - comments: bool = True - ) -> Union[frozenset, list]: +def get_corpus(filename: str, comments: bool = True) -> frozenset: """ - Read corpus data from file and return a frozenset or a list. + Read corpus data from file and return a frozenset. - Each line in the file will be a member of the set or the list. + Each line in the file will be a member of the set. - By default, a frozenset will be return, with whitespace stripped and - empty values and duplicates removed. - - If as_is is True, a list will be return, with no modifications - in member values and their orders. + Whitespace stripped and empty values and duplicates removed. If comments is False, any text at any position after the character '#' in each line will be discarded. 
:param str filename: filename of the corpus to be read - :param bool as_is: no modification to the text, and return a list :param bool comments: keep comments - :return: :class:`frozenset` or :class:`list` consisting of lines in the file - :rtype: :class:`frozenset` or :class:`list` + :return: :class:`frozenset` consisting of lines in the file + :rtype: :class:`frozenset` :Example: :: @@ -108,10 +100,6 @@ def get_corpus(filename: str, # output: # frozenset({'แต่', 'ไม่'}) - get_corpus("negations_th.txt", as_is=True) - # output: - # ['แต่', 'ไม่'] - # input file (ttc_freq.txt): # ตัวบท10 # โดยนัยนี้1 @@ -147,18 +135,49 @@ def get_corpus(filename: str, lines = fh.read().splitlines() if not comments: - # take only text before character '#' - lines = [line.split("#", 1)[0] for line in lines] + # if the line has a '#' character, take only text before the first '#' + lines = [line.split("#", 1)[0].strip() for line in lines] - if as_is: - return lines + return frozenset(filter(None, lines)) - lines = [line.strip() for line in lines] - return frozenset(filter(None, lines)) +def get_corpus_as_is(filename: str) -> list: + """ + Read corpus data from file, as it is, and return a list. + + Each line in the file will be a member of the list. + + No modifications in member values and their orders. + + If strip or comment removal is needed, use get_corpus() instead. 
+ + :param str filename: filename of the corpus to be read + + :return: :class:`list` consisting of lines in the file + :rtype: :class:`list` + + :Example: + :: + + from pythainlp.corpus import get_corpus + # input file (negations_th.txt): + # แต่ + # ไม่ -def get_corpus_default_db(name: str, version: str = None) -> Union[str, None]: + get_corpus_as_is("negations_th.txt") + # output: + # ['แต่', 'ไม่'] + """ + path = path_pythainlp_corpus(filename) + lines = [] + with open(path, "r", encoding="utf-8-sig") as fh: + lines = fh.read().splitlines() + + return lines + + +def get_corpus_default_db(name: str, version: str = '') -> Union[str, None]: """ Get model path from default_db.json @@ -179,15 +198,17 @@ def get_corpus_default_db(name: str, version: str = None) -> Union[str, None]: return path_pythainlp_corpus( corpus_db[name]["versions"][version]["filename"] ) - elif version is None: # load latest version + elif not version: # load latest version version = corpus_db[name]["latest_version"] return path_pythainlp_corpus( corpus_db[name]["versions"][version]["filename"] ) + return None + def get_corpus_path( - name: str, version: str = None, force: bool = False + name: str, version: str = '', force: bool = False ) -> Union[str, None]: """ Get corpus path. @@ -229,8 +250,9 @@ def get_corpus_path( print(get_corpus_path('wiki_lm_lstm')) # output: /root/pythainlp-data/thwiki_model_lstm.pth """ - # Customize your corpus path then close the line from lines 164 through 190. - _CUSTOMIZE = { + from typing import Dict + + _CUSTOMIZE: Dict[str, str] = { # "the corpus name":"path" } if name in list(_CUSTOMIZE): @@ -379,7 +401,7 @@ def _check_version(cause: str) -> bool: def download( - name: str, force: bool = False, url: str = None, version: str = None + name: str, force: bool = False, url: str = '', version: str = '' ) -> bool: """ Download corpus. 
@@ -430,7 +452,7 @@ def download( corpus = corpus_db[name] print("Corpus:", name) - if version is None: + if not version: for v, file in corpus["versions"].items(): if _check_version(file["pythainlp_version"]): version = v @@ -439,10 +461,7 @@ def download( if version not in corpus["versions"]: print("Not found corpus") return False - elif ( - _check_version(corpus["versions"][version]["pythainlp_version"]) - is False - ): + elif _check_version(corpus["versions"][version]["pythainlp_version"]) is False: print("Versions Corpus not support") return False corpus_versions = corpus["versions"][version] @@ -486,9 +505,7 @@ def download( foldername = name + "_" + str(version) if not os.path.exists(get_full_data_path(foldername)): os.mkdir(get_full_data_path(foldername)) - with zipfile.ZipFile( - get_full_data_path(file_name), "r" - ) as zip: + with zipfile.ZipFile(get_full_data_path(file_name), "r") as zip: zip.extractall(path=get_full_data_path(foldername)) if found: @@ -500,9 +517,7 @@ def download( # This awkward behavior is for backward-compatibility with # database files generated previously using TinyDB if local_db["_default"]: - corpus_no = ( - max((int(no) for no in local_db["_default"])) + 1 - ) + corpus_no = max((int(no) for no in local_db["_default"])) + 1 else: corpus_no = 1 local_db["_default"][str(corpus_no)] = { @@ -565,9 +580,7 @@ def remove(name: str) -> bool: return False with open(corpus_db_path(), "r", encoding="utf-8-sig") as f: db = json.load(f) - data = [ - corpus for corpus in db["_default"].values() if corpus["name"] == name - ] + data = [corpus for corpus in db["_default"].values() if corpus["name"] == name] if data: path = get_corpus_path(name) diff --git a/pythainlp/corpus/icu.py b/pythainlp/corpus/icu.py index 71a002d4e..e7d0198b7 100644 --- a/pythainlp/corpus/icu.py +++ b/pythainlp/corpus/icu.py @@ -17,7 +17,7 @@ def thai_icu_words() -> FrozenSet[str]: Return a frozenset of words from the Thai dictionary for BreakIterator of the International 
Components for Unicode (ICU). - :return: :class:`frozenset` containing `str` + :return: :class:`frozenset` containing Thai words. :rtype: :class:`frozenset` """ diff --git a/pythainlp/corpus/thai_orst_words.txt b/pythainlp/corpus/orst_words_th.txt similarity index 100% rename from pythainlp/corpus/thai_orst_words.txt rename to pythainlp/corpus/orst_words_th.txt diff --git a/pythainlp/corpus/volubilis.py b/pythainlp/corpus/volubilis.py index b422fc30c..38906f125 100644 --- a/pythainlp/corpus/volubilis.py +++ b/pythainlp/corpus/volubilis.py @@ -8,25 +8,25 @@ from pythainlp.corpus.common import get_corpus -_VOLUBILIS = None -_VOLUBILIS_FILENAME = "volubilis_modified.txt" +_VOLUBILIS_WORDS = None +_VOLUBILIS_FILENAME = "volubilis_words_th.txt" -def volubilis() -> FrozenSet[str]: +def thai_volubilis_words() -> FrozenSet[str]: """ - Return a frozenset of words from the Volubilis dictionary. + Return a frozenset of Thai words from the Volubilis dictionary + + See: `dev/pythainlp/corpus/volubilis_words_th.txt\ + `_ - The data is at pythainlp/corpus/volubilis_modified.txt - The word list has beed prepared by the code at: - https://github.com/konbraphat51/Thai_Dictionary_Cleaner - Based Volubilis dictionary 23.1 (March 2023): - https://belisan-volubilis.blogspot.com/ + More info: + https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/corpus_license.md - :return: :class:`frozenset` containing words in the Volubilis dictionary. + :return: :class:`frozenset` containing Thai words. 
:rtype: :class:`frozenset` """ - global _VOLUBILIS - if not _VOLUBILIS: - _VOLUBILIS = get_corpus(_VOLUBILIS_FILENAME) + global _VOLUBILIS_WORDS + if not _VOLUBILIS_WORDS: + _VOLUBILIS_WORDS = get_corpus(_VOLUBILIS_FILENAME, comments=False) - return _VOLUBILIS + return _VOLUBILIS_WORDS diff --git a/pythainlp/corpus/volubilis_modified.txt b/pythainlp/corpus/volubilis_words_th.txt similarity index 99% rename from pythainlp/corpus/volubilis_modified.txt rename to pythainlp/corpus/volubilis_words_th.txt index 1d741eabd..39cb66495 100644 --- a/pythainlp/corpus/volubilis_modified.txt +++ b/pythainlp/corpus/volubilis_words_th.txt @@ -1,3 +1,5 @@ +# Thai words from Volubilis dictionary +# SPDX-License-Identifier: CC-BY-SA-4.0 อ๊ะ อา อา diff --git a/pythainlp/corpus/wikipedia_titles.py b/pythainlp/corpus/wikipedia.py similarity index 71% rename from pythainlp/corpus/wikipedia_titles.py rename to pythainlp/corpus/wikipedia.py index a94c54022..abe39f10c 100644 --- a/pythainlp/corpus/wikipedia_titles.py +++ b/pythainlp/corpus/wikipedia.py @@ -9,24 +9,27 @@ from pythainlp.corpus.common import get_corpus _WIKIPEDIA_TITLES = None -_WIKIPEDIA_TITLES_FILENAME = "wikipedia_titles.txt" +_WIKIPEDIA_TITLES_FILENAME = "wikipedia_titles_th.txt" -def wikipedia_titles() -> FrozenSet[str]: +def thai_wikipedia_titles() -> FrozenSet[str]: """ Return a frozenset of words from Thai Wikipedia titles corpus. They are mostly nouns and noun phrases, including event, organization, people, place, and product names. Commonly misspelled words are included intentionally. + See: `dev/pythainlp/corpus/wikipedia_titles_th.txt\ + `_ + More info: https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/corpus_license.md - :return: :class:`frozenset` containing words in Thai Wikipedia titles. + :return: :class:`frozenset` containing Thai words. 
:rtype: :class:`frozenset` """ global _WIKIPEDIA_TITLES if not _WIKIPEDIA_TITLES: - _WIKIPEDIA_TITLES = get_corpus(_WIKIPEDIA_TITLES_FILENAME) + _WIKIPEDIA_TITLES = get_corpus(_WIKIPEDIA_TITLES_FILENAME, comments=False) return _WIKIPEDIA_TITLES diff --git a/pythainlp/corpus/wikipedia_titles.txt b/pythainlp/corpus/wikipedia_titles_th.txt similarity index 99% rename from pythainlp/corpus/wikipedia_titles.txt rename to pythainlp/corpus/wikipedia_titles_th.txt index 8d712413e..cb24f407c 100644 --- a/pythainlp/corpus/wikipedia_titles.txt +++ b/pythainlp/corpus/wikipedia_titles_th.txt @@ -1,3 +1,5 @@ +# Titles from Thai Wikipedia +# SPDX-License-Identifier: CC-BY-SA-4.0 ปากหวาน ทวิชกลิ่นประทุม รถไฟฟ้าเซี่ยงไฮ้