From e1760a5bea040eb7d900abd3a333ec13acf4a2e8 Mon Sep 17 00:00:00 2001
From: Made Nityasya
Date: Thu, 13 Oct 2022 14:33:24 +0800
Subject: [PATCH 1/2] Initial multilingual_open_relations dataset

---
 .../multilingual_open_relations/__init__.py   |   0
 .../multilingual_open_relations.py            | 260 ++++++++++++++++++
 nusacrowd/utils/constants.py                  |   2 +
 3 files changed, 262 insertions(+)
 create mode 100644 nusacrowd/nusa_datasets/multilingual_open_relations/__init__.py
 create mode 100644 nusacrowd/nusa_datasets/multilingual_open_relations/multilingual_open_relations.py

diff --git a/nusacrowd/nusa_datasets/multilingual_open_relations/__init__.py b/nusacrowd/nusa_datasets/multilingual_open_relations/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/nusacrowd/nusa_datasets/multilingual_open_relations/multilingual_open_relations.py b/nusacrowd/nusa_datasets/multilingual_open_relations/multilingual_open_relations.py
new file mode 100644
index 00000000..cf30f8df
--- /dev/null
+++ b/nusacrowd/nusa_datasets/multilingual_open_relations/multilingual_open_relations.py
@@ -0,0 +1,260 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from nusacrowd.utils import schemas
+from nusacrowd.utils.configs import NusantaraConfig
+from nusacrowd.utils.constants import Tasks
+
+_CITATION = """\
+@inproceedings{faruqui-kumar-2015-multilingual,
+    title = "Multilingual Open Relation Extraction Using Cross-lingual Projection",
+    author = "Faruqui, Manaal and
+      Kumar, Shankar",
+    booktitle = "Proceedings of the 2015 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies",
+    month = may # "{--}" # jun,
+    year = "2015",
+    address = "Denver, Colorado",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/N15-1151",
+    doi = "10.3115/v1/N15-1151",
+    pages = "1351--1356",
+}
+"""
+
+_DATASETNAME = "multilingual_open_relations"
+
+_DESCRIPTION = """\
+Relation extraction is the task of assigning a semantic relationship between a pair of arguments. This dataset provides automatically extracted relations obtained using the algorithm in Faruqui and Kumar (2015).
+Faruqui and Kumar (2015) describe a cross-lingual projection algorithm for multilingual RE that translates text from a foreign language to English, performs relation extraction in English and then projects these relations back to the foreign language.
+"""
+
+_HOMEPAGE = "https://www.kaggle.com/datasets/shankkumar/multilingualopenrelations15"
+
+_LICENSE = "Attribution 3.0 Unported (CC BY 3.0)"
+
+_LANGUAGES = ["ind"]
+
+_URLS = {
+    _DATASETNAME: "local_dataset/multilingual_open_relations-auto-extractions-ind", # TODO: update
+}
+
+_SUPPORTED_TASKS = [Tasks.RELATION_EXTRACTION]
+
+_SOURCE_VERSION = "1.0.0"
+
+_NUSANTARA_VERSION = "1.0.0"
+
+
+class MultilingualOpenRelations(datasets.GeneratorBasedBuilder):
+    """Relation extraction is the task of assigning a semantic relationship between a pair of arguments. This dataset provides automatically extracted relations obtained using the algorithm in Faruqui and Kumar (2015)."""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION)
+
+    BUILDER_CONFIGS = [
+        NusantaraConfig(
+            name="multilingual_open_relations_source",
+            version=SOURCE_VERSION,
+            description="Multilingual Open Relations source schema",
+            schema="source",
+            subset_id="multilingual_open_relations",
+        ),
+        NusantaraConfig(
+            name="multilingual_open_relations_nusantara_kb",
+            version=NUSANTARA_VERSION,
+            description="Multilingual Open Relations Nusantara schema",
+            schema="nusantara_kb",
+            subset_id="multilingual_open_relations",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = "multilingual_open_relations_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+
+        if self.config.schema == "source":
+            # TODO: update if necessary
+            features = datasets.Features({
+                "index": datasets.Value("string"),
+                "wikipedia_url": datasets.Value("string"),
+                "sentence": datasets.Value("string"),
+                "sentence_en": datasets.Value("string"),
+                "relations": [{
+                    "argument_1": datasets.Value("string"),
+                    "argument_2": datasets.Value("string"),
+                    "relation": datasets.Value("string"),
+                    "argument_1_en": datasets.Value("string"),
+                    "argument_2_en": datasets.Value("string"),
+                    "relation_en": datasets.Value("string"),
+                }]
+            })
+
+        elif self.config.schema == "nusantara_kb":
+            features = schemas.kb_features
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+
+        urls = _URLS[_DATASETNAME]
+
+        # data_dir = dl_manager.download_and_extract(urls) # TODO: update to get from url
+        url_path = Path(urls)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": url_path,
+                    "split": "train",
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
+        rows = self._read_from_source_file(filepath)
+
+        if self.config.schema == "source":
+            for idx, row in enumerate(rows):
+                row["index"] = str(idx)
+                yield idx, row
+
+        elif self.config.schema == "nusantara_kb":
+            for idx, row in enumerate(rows):
+                row = self._to_nusa_kb_scheme(idx, row)
+                yield idx, row
+
+        else:
+            raise ValueError(f"Invalid config: {self.config.name}")
+
+    def _read_from_source_file(self, filepath: Path):
+        """
+        Original data format is the following:
+        Wikipedia URL ||| Source Language (SL) Sentence ||| Argument 1 in SL ||| Relation in SL ||| Argument 2 in SL ||| English Sentence ||| Argument 1 in English ||| Relation in English ||| Argument 2 in English
+        """
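+        # Illustration only (not a real record from the source file): a
+        # hypothetical input line in the "|||"-delimited layout documented
+        # above would look like:
+        #   http://id.wikipedia.org/wiki/Soekarno ||| Soekarno lahir di Surabaya. ||| Soekarno ||| lahir di ||| Surabaya ||| Soekarno was born in Surabaya. ||| Soekarno ||| was born in ||| Surabaya
+        # parse_row() below maps sections[0]..sections[8] of such a line onto
+        # the keys of the source schema defined in _info().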
+        def parse_row(line):
+            sections = line.split("|||")
+            row = {
+                "wikipedia_url": sections[0].strip(),
+                "sentence": sections[1].strip(),
+                "argument_1": sections[2].strip(),
+                "argument_2": sections[4].strip(),
+                "relation": sections[3].strip(),
+                "sentence_en": sections[5].strip(),
+                "argument_1_en": sections[6].strip(),
+                "argument_2_en": sections[8].strip(),
+                "relation_en": sections[7].strip(),
+            }
+            return row
+
+        map_url_sentence_to_idx = {}
+        data = []
+
+        with open(filepath, "r+") as fr:
+            for line in fr:
+                row = parse_row(line)
+
+                url_sentence = f"{row['wikipedia_url']}_{row['sentence']}"
+                if url_sentence not in map_url_sentence_to_idx:
+                    map_url_sentence_to_idx[url_sentence] = len(map_url_sentence_to_idx)
+                    data.append({
+                        "wikipedia_url": row["wikipedia_url"],
+                        "sentence": row["sentence"],
+                        "sentence_en": row["sentence_en"],
+                        "relations": []
+                    })
+                rel = {
+                    "argument_1": row["argument_1"],
+                    "argument_2": row["argument_2"],
+                    "relation": row["relation"],
+                    "argument_1_en": row["argument_1_en"],
+                    "argument_2_en": row["argument_2_en"],
+                    "relation_en": row["relation_en"],
+                }
+                data[map_url_sentence_to_idx[url_sentence]]["relations"].append(rel)
+        return data
+
+    def _to_nusa_kb_scheme(self, idx, row):
+
+        rel_id = 0
+        ent_id = 0
+
+        relations = []
+        entities = []
+
+        def get_entity(ent_id, entity_str):
+            i = f"{idx}_EntID_{ent_id}"
+            entity = {
+                "id": i,
+                "type": "",
+                "text": [entity_str],
+                "offsets": [[0, 0]], # TODO: calculate the offset
+                "normalized": [],
+            }
+            ent_id += 1
+            return i, ent_id, entity
+
+        for rel in row["relations"]:
+            id_1, ent_id, ent_1 = get_entity(ent_id, rel["argument_1"])
+            id_2, ent_id, ent_2 = get_entity(ent_id, rel["argument_2"])
+            entities.append(ent_1)
+            entities.append(ent_2)
+            relations.append({
+                "id": f"{idx}_RelID_{rel_id}",
+                "type": rel["relation"],
+                "arg1_id": id_1,
+                "arg2_id": id_2,
+                "normalized": [
+                    {
+                        "db_name": None,
+                        "db_id": None,
+                    }
+                ]
+            })
+            rel_id += 1
+
+        nusa_scheme = {
+            "id": str(idx),
+            "passages": [
+                {
+                    "id": f"{idx}_PsgID_0",
+                    "type": "text",
+                    "text": [row["sentence"]],
+                    "offsets": [
+                        [0, len(row["sentence"])]
+                    ]
+                }
+            ],
+            "entities": entities,
+            "coreferences": [],
+            "events": [],
+            "relations": relations,
+        }
+        return nusa_scheme
diff --git a/nusacrowd/utils/constants.py b/nusacrowd/utils/constants.py
index 53c28f0c..e61c0eb6 100644
--- a/nusacrowd/utils/constants.py
+++ b/nusacrowd/utils/constants.py
@@ -35,6 +35,7 @@ class Tasks(Enum):
     WORD_SENSE_DISAMBIGUATION = "WSD"
     KEYWORD_EXTRACTION = "KE"
     COREFERENCE_RESOLUTION = "COREF"
+    RELATION_EXTRACTION = "RE"
 
     # Single Text Classification
     SENTIMENT_ANALYSIS = "SA"
@@ -93,6 +94,7 @@ class Tasks(Enum):
     Tasks.KEYWORD_EXTRACTION: "SEQ_LABEL",
     Tasks.COREFERENCE_RESOLUTION: "KB",
     Tasks.DIALOGUE_SYSTEM: "KB",
+    Tasks.RELATION_EXTRACTION: "KB",
     Tasks.NAMED_ENTITY_RECOGNITION: "SEQ_LABEL",
     Tasks.POS_TAGGING: "SEQ_LABEL",
     Tasks.KEYWORD_TAGGING: "SEQ_LABEL",

From 0d755b39fd4cab791c8a2b3bc9599936c49bd203 Mon Sep 17 00:00:00 2001
From: Made Nityasya
Date: Thu, 13 Oct 2022 14:37:18 +0800
Subject: [PATCH 2/2] Reformat

---
 .../multilingual_open_relations.py            | 92 +++++++++----------
 1 file changed, 42 insertions(+), 50 deletions(-)

diff --git a/nusacrowd/nusa_datasets/multilingual_open_relations/multilingual_open_relations.py b/nusacrowd/nusa_datasets/multilingual_open_relations/multilingual_open_relations.py
index cf30f8df..4f11273a 100644
--- a/nusacrowd/nusa_datasets/multilingual_open_relations/multilingual_open_relations.py
+++ b/nusacrowd/nusa_datasets/multilingual_open_relations/multilingual_open_relations.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
 from pathlib import Path
 from typing import Dict, List, Tuple
 
 import datasets
@@ -42,8 +41,9 @@
 _DATASETNAME = "multilingual_open_relations"
 
 _DESCRIPTION = """\
-Relation extraction is the task of assigning a semantic relationship between a pair of arguments. This dataset provides automatically extracted relations obtained using the algorithm in Faruqui and Kumar (2015).
-Faruqui and Kumar (2015) describe a cross-lingual projection algorithm for multilingual RE that translates text from a foreign language to English, performs relation extraction in English and then projects these relations back to the foreign language.
+Relation extraction is the task of assigning a semantic relationship between a pair of arguments.
+This dataset provides automatically extracted relations obtained using the algorithm in Faruqui and Kumar (2015).
+It uses a cross-lingual projection algorithm for multilingual RE that translates text from a foreign language to English, performs relation extraction in English, and then projects these relations back to the foreign language.
 """
@@ -53,7 +53,7 @@
 _LANGUAGES = ["ind"]
 
 _URLS = {
-    _DATASETNAME: "local_dataset/multilingual_open_relations-auto-extractions-ind", # TODO: update
+    _DATASETNAME: "local_dataset/multilingual_open_relations-auto-extractions-ind",  # TODO: update
 }
 
 _SUPPORTED_TASKS = [Tasks.RELATION_EXTRACTION]
@@ -92,20 +92,24 @@ def _info(self) -> datasets.DatasetInfo:
 
         if self.config.schema == "source":
             # TODO: update if necessary
-            features = datasets.Features({
+            features = datasets.Features(
+                {
                     "index": datasets.Value("string"),
                     "wikipedia_url": datasets.Value("string"),
                     "sentence": datasets.Value("string"),
                     "sentence_en": datasets.Value("string"),
-                    "relations": [{
-                        "argument_1": datasets.Value("string"),
-                        "argument_2": datasets.Value("string"),
-                        "relation": datasets.Value("string"),
-                        "argument_1_en": datasets.Value("string"),
-                        "argument_2_en": datasets.Value("string"),
-                        "relation_en": datasets.Value("string"),
-                    }]
-                })
+                    "relations": [
+                        {
+                            "argument_1": datasets.Value("string"),
+                            "argument_2": datasets.Value("string"),
+                            "relation": datasets.Value("string"),
+                            "argument_1_en": datasets.Value("string"),
+                            "argument_2_en": datasets.Value("string"),
+                            "relation_en": datasets.Value("string"),
+                        }
+                    ],
+                }
+            )
 
         elif self.config.schema == "nusantara_kb":
             features = schemas.kb_features
@@ -148,7 +152,7 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
             for idx, row in enumerate(rows):
                 row = self._to_nusa_kb_scheme(idx, row)
                 yield idx, row
-        
+
         else:
             raise ValueError(f"Invalid config: {self.config.name}")
@@ -176,20 +180,15 @@ def parse_row(line):
 
         map_url_sentence_to_idx = {}
         data = []
-        
+
         with open(filepath, "r+") as fr:
             for line in fr:
-                row = parse_row(line)
-            
+                row = parse_row(line)
+
                 url_sentence = f"{row['wikipedia_url']}_{row['sentence']}"
                 if url_sentence not in map_url_sentence_to_idx:
                     map_url_sentence_to_idx[url_sentence] = len(map_url_sentence_to_idx)
-                    data.append({
-                        "wikipedia_url": row["wikipedia_url"],
-                        "sentence": row["sentence"],
-                        "sentence_en": row["sentence_en"],
-                        "relations": []
-                    })
+                    data.append({"wikipedia_url": row["wikipedia_url"], "sentence": row["sentence"], "sentence_en": row["sentence_en"], "relations": []})
                 rel = {
                     "argument_1": row["argument_1"],
                     "argument_2": row["argument_2"],
@@ -200,9 +199,9 @@ def parse_row(line):
                     "argument_1_en": row["argument_1_en"],
                     "argument_2_en": row["argument_2_en"],
                     "relation_en": row["relation_en"],
                 }
                 data[map_url_sentence_to_idx[url_sentence]]["relations"].append(rel)
         return data
-    
+
     def 
_to_nusa_kb_scheme(self, idx, row):
-    
+
         rel_id = 0
         ent_id = 0
@@ -215,43 +214,36 @@ def get_entity(ent_id, entity_str):
                 "id": i,
                 "type": "",
                 "text": [entity_str],
-                "offsets": [[0, 0]], # TODO: calculate the offset
+                "offsets": [[0, 0]],  # TODO: calculate the offset
                 "normalized": [],
             }
             ent_id += 1
             return i, ent_id, entity
 
-        for rel in row["relations"]: 
+        for rel in row["relations"]:
             id_1, ent_id, ent_1 = get_entity(ent_id, rel["argument_1"])
             id_2, ent_id, ent_2 = get_entity(ent_id, rel["argument_2"])
             entities.append(ent_1)
             entities.append(ent_2)
-            relations.append({
-                "id": f"{idx}_RelID_{rel_id}",
-                "type": rel["relation"],
-                "arg1_id": id_1,
-                "arg2_id": id_2,
-                "normalized": [
-                    {
-                        "db_name": None,
-                        "db_id": None,
-                    }
-                ]
-            })
+            relations.append(
+                {
+                    "id": f"{idx}_RelID_{rel_id}",
+                    "type": rel["relation"],
+                    "arg1_id": id_1,
+                    "arg2_id": id_2,
+                    "normalized": [
+                        {
+                            "db_name": None,
+                            "db_id": None,
+                        }
+                    ],
+                }
+            )
             rel_id += 1
 
         nusa_scheme = {
             "id": str(idx),
-            "passages": [
-                {
-                    "id": f"{idx}_PsgID_0",
-                    "type": "text",
-                    "text": [row["sentence"]],
-                    "offsets": [
-                        [0, len(row["sentence"])]
-                    ]
-                }
-            ],
+            "passages": [{"id": f"{idx}_PsgID_0", "type": "text", "text": [row["sentence"]], "offsets": [[0, len(row["sentence"])]]}],
             "entities": entities,
             "coreferences": [],
             "events": [],
             "relations": relations,
         }
         return nusa_scheme
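
After these two patches, the loader can presumably be exercised as follows. This is a minimal sketch, not part of the patches: it assumes the script path added in PATCH 1/2 and a local copy of the Kaggle file at the placeholder path in _URLS (still marked TODO above), and it loads the nusantara_kb config enabled by the new RELATION_EXTRACTION -> "KB" mapping in constants.py.

    from datasets import load_dataset

    # Hypothetical invocation; the config name comes from BUILDER_CONFIGS above.
    dset = load_dataset(
        "nusacrowd/nusa_datasets/multilingual_open_relations/multilingual_open_relations.py",
        name="multilingual_open_relations_nusantara_kb",
        split="train",
    )

    # Each example follows the kb schema built by _to_nusa_kb_scheme():
    # passages, entities, and relations whose "type" holds the relation string.
    print(dset[0]["relations"])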