Grakn 1.5 migration (#58)

## What is the goal of this PR? - Update the documentation for KGCN - Migrate to use Grakn commit 20750ca0a46b4bc252ad81edccdfd8d8b7c46caa and Python grakn-client commit 5459d5d88a30631c5ebdac3a9b0d5ea6f184c8ae ## What are the changes implemented in this PR? - KGCN README improvements, corrections, fixes including updated diagrams - CI updates to use Grakn distributions hosted on GCP for unit, integration and end-to-end tests
typedb · Mar 5, 2019 · 26303e1 · 26303e1
1 parent 02015f5
commit 26303e1
Show file tree

Hide file tree

Showing 21 changed files with 83 additions and 76 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -19,10 +19,10 @@ jobs:
       - run: sudo apt-get update
       - run: pyenv install 3.6.3
       - run: pyenv global 3.6.3
-      - run: wget https://github.com/graknlabs/grakn/releases/download/v1.4.3/grakn-core-1.4.3.zip
-      - run: unzip grakn-core-1.4.3.zip
-      - run: nohup grakn-core-1.4.3/grakn server start
-      - run: grakn-core-1.4.3/graql console -k test_schema -f  kglib/kgcn/test_data/schema.gql
+      - run: wget https://storage.googleapis.com/kglib/grakn-core-all-20750ca0a46b4bc252ad81edccdfd8d8b7c46caa.zip
+      - run: unzip grakn-core-all-20750ca0a46b4bc252ad81edccdfd8d8b7c46caa.zip
+      - run: nohup grakn-core-all/grakn server start
+      - run: cd grakn-core-all && ./grakn console -k test_schema -f ../kglib/kgcn/test_data/schema.gql
       - run: bazel test //kglib/... --test_output=streamed --force_python PY3 --python_path $(which python)
 
   test-deploy-pip:

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.1
+0.1a3
diff --git a/WORKSPACE b/WORKSPACE
@@ -42,6 +42,6 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_file")
 
 http_file(
   name = "animaltrade_dist",
-  urls = ["https://github.com/graknlabs/kglib/releases/download/v0.1a1/grakn-animaltrade.zip", # TODO How to update to the latest relase each time?
+  urls = ["https://storage.googleapis.com/kglib/grakn-core-animaltrade-20750ca0a46b4bc252ad81edccdfd8d8b7c46caa.zip", # TODO How to update to the latest release each time?
   ]
 )
diff --git a/examples/BUILD b/examples/BUILD
@@ -36,7 +36,7 @@ py_library(
         requirement('grakn-kglib'),
 
         # Grakn deps
-        requirement('grakn'),
+        requirement('grakn-client'),
         requirement('grpcio'),
 
         # TensorFlow deps

diff --git a/examples/kgcn/animal_trade/prediction_schema.gql b/examples/kgcn/animal_trade/prediction_schema.gql
@@ -24,7 +24,7 @@ prediction-score sub attribute, datatype double;
 
 traded-item has endangerment-level;
 
-value-prediction sub relationship,
+value-prediction sub relation,
     has prediction-score,
     relates predicted-value,
     relates predicting-kgcn-model;
@@ -49,7 +49,7 @@ match $t1 isa traded-item, has endangerment-level $el1 via $r1; $el1 1; $vp1(pre
 
 define
 
-suspicious-activity-detection sub relationship,
+suspicious-activity-detection sub relation,
     relates suspicious-activity,
     relates cause-of-suspicion;
 

diff --git a/examples/kgcn/animal_trade/schema.gql b/examples/kgcn/animal_trade/schema.gql
@@ -61,7 +61,7 @@ define
         has unit-of-measurement,
         plays quantification-measurement;
 
-    exchange sub relationship,
+    exchange sub relation,
         relates receiving-country,
         relates providing-country,
         relates exchanged-item,
@@ -80,11 +80,11 @@ define
         relates imported-item as exchanged-item,
         plays corresponding-import;
 
-    import-export-correspondence sub relationship,
+    import-export-correspondence sub relation,
         relates corresponding-import,
         relates corresponding-export;
 
-    quantification sub relationship,
+    quantification sub relation,
         relates quantified-subject,
         relates quantification-measurement;
 
@@ -111,7 +111,7 @@ define
         plays originated-species,
         plays sub-taxon;
 
-    hierarchy sub relationship,
+    hierarchy sub relation,
         relates superior,
         relates subordinate;
 
@@ -131,11 +131,11 @@ define
         relates containing-continent as container,
         relates contained-country as containee;
 
-    species-origination sub relationship,
+    species-origination sub relation,
         relates originating-country,
         relates originated-species;
 
-    taxon-membership sub relationship,
+    taxon-membership sub relation,
         relates member-item,
         relates taxonomic-group;
 

diff --git a/examples/kgcn/animal_trade/test/end_to_end_test.py b/examples/kgcn/animal_trade/test/end_to_end_test.py
@@ -82,7 +82,7 @@ def test_end_to_end(self):
                           'external/animaltrade_dist/file/downloaded-unzipped'])
 
         # Start Grakn
-        sub.run(['external/animaltrade_dist/file/downloaded-unzipped/grakn-animaltrade/grakn', 'server', 'start'])
+        sub.run(['external/animaltrade_dist/file/downloaded-unzipped/grakn-core-animaltrade-1.5.0/grakn', 'server', 'start'])
 
         modes = (TRAIN, EVAL)
 

diff --git a/kglib/BUILD b/kglib/BUILD
@@ -138,7 +138,7 @@ py_library(
     srcs = glob(['__init__.py', 'kgcn/**/*.py']),
     deps = [
         # Grakn deps
-        requirement('grakn'),
+        requirement('grakn-client'),
         requirement('grpcio'),
 
         # TensorFlow deps

diff --git a/kglib/kgcn/README.md b/kglib/kgcn/README.md
@@ -13,15 +13,15 @@ A KGCN can be used to create vector representations, *embeddings*, of any labell
 
 Often, data doesn't fit well into a tabular format. There are many benefits to storing complex and interrelated data in a knowledge graph, not least that the context of each datapoint can be stored in full.
 
-However, many existing machine learning techniques rely upon an *input vector for each example*. This can make it difficult to directly apply many conventional machine learning techniques over a knowledge graph. 
+However, many existing machine learning techniques rely upon the existence of an *input vector for each example*. Creating such a vector to represent a node in a knowledge graph is non-trivial.
 
-In order to make use of the wealth of existing ideas, tools and pipelines in machine learning, we need a method of building a vector to describe a datapoint in a knowledge graph. In this way we can leverage contextual information from a knowledge graph for machine learning.
+In order to make use of the wealth of existing ideas, tools and pipelines in machine learning, we need a method of building these vectors. In this way we can leverage contextual information from a knowledge graph for machine learning.
 
-This is what a KGCN can achieve. Given an example datapoint taken from a knowledge graph, it can examine the nodes in the vicinity of an example, its *context*. Based on this context it can determine a vector representation, an *embedding*, for that example.
+This is what a KGCN can achieve. Given an example node in a knowledge graph, it can examine the nodes in the vicinity of that example, its *context*. Based on this context it can determine a vector representation, an *embedding*, for that example.
 
 **There are two broad learning tasks a KGCN is suitable for:**
 
-**1. Supervised learning from a knowledge graph for prediction e.g. multi-class classification (currently implemented), regression, link prediction**
+**1. Supervised learning from a knowledge graph for prediction e.g. multi-class classification (implemented), regression, link prediction**
 **2. Unsupervised creation of Knowledge Graph Embeddings, e.g. for clustering and node comparison tasks**
 
 ![KGCN Process](readme_images/KGCN_process.png)
@@ -46,7 +46,8 @@ In order to build a *useful* representation, a KGCN needs to perform some learni
 The following is a template of what must be defined in order to instantiate a KGCN, optimised for a downstream learning task of multi-class classification:
 
 ```python
-import kglib.kgcn.embed.model as model
+import kglib.kgcn.core.model as model
+import kglib.kgcn.learn.classify as classify
 import tensorflow as tf
 import grakn
 
@@ -65,10 +66,16 @@ kgcn = model.KGCN(neighbour_sample_sizes,
                   batch_size)
 
 optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
-classifier = learn.classify.SupervisedKGCNClassifier(kgcn, optimizer, num_classes, log_dir,
-                                                        max_training_steps=max_training_steps)
 
-training_feed_dict = classifier.get_feed_dict(session, training_things, labels=training_labels)
+classifier = classify.SupervisedKGCNClassifier(kgcn,
+                                               optimizer, 
+                                               num_classes, 
+                                               log_dir,
+                                               max_training_steps=max_training_steps)
+
+training_feed_dict = classifier.get_feed_dict(session, 
+                                              training_things, 
+                                              labels=training_labels)
 
 classifier.train(training_feed_dict)
 
@@ -80,17 +87,17 @@ There is also a [full example](https://github.com/graknlabs/kglib/tree/master/ex
 
 ## Methodology
 
-The ideology behind this project is described [here](https://blog.grakn.ai/knowledge-graph-convolutional-networks-machine-learning-over-reasoned-knowledge-9eb5ce5e0f68), and a [video of the presentation](https://youtu.be/Jx_Twc75ka0?t=368). The principles of the implementation are based on [GraphSAGE](http://snap.stanford.edu/graphsage/), from the Stanford SNAP group, heavily adapted to work over a knowledge graph. Instead of working on a typical property graph, a KGCN learns from the context of a *typed hypergraph*, **Grakn**. Additionally, it learns from facts deduced by Grakn's *automated logical reasoner*. From this point onwards some understanding of [Grakn's docs](http://dev.grakn.ai) is assumed.
+The ideology behind this project is described [here](https://blog.grakn.ai/knowledge-graph-convolutional-networks-machine-learning-over-reasoned-knowledge-9eb5ce5e0f68) and in a [video of the presentation](https://youtu.be/Jx_Twc75ka0?t=368). The principles of the implementation are based on [GraphSAGE](http://snap.stanford.edu/graphsage/), from the Stanford SNAP group, heavily adapted to work over a knowledge graph. Instead of working on a typical property graph, a KGCN learns from contextual data stored in a *typed hypergraph*, **Grakn**. Additionally, it learns from facts deduced by Grakn's *automated logical reasoner*. From this point onwards some understanding of [Grakn's docs](http://dev.grakn.ai) is assumed.
 
 Now we introduce the key components and how they interact.
 
 ### KGCN
 
-A KGCN is responsible for deriving embeddings for a set of Things (and thereby directly learn to classify them). We start by querying Grakn to find a set of labelled examples. Following that, we gather data about the context of each example Thing. We do this by considering their *k-hop* neighbours.
+A KGCN is responsible for deriving embeddings for a set of Things (and thereby directly learning to classify them). We start by querying Grakn to find a set of labelled examples. Following that, we gather data about the context of each example Thing. We do this by considering their neighbours, and their neighbours' neighbours, recursively, up to K hops away.
 
-![methodology](readme_images/methodology.png)We retrieve the data concerning this neighbourhood from Grakn (diagram above). This information includes the *type hierarchy*, *roles*, and *attribute* values of each neighbouring Thing encountered, and any inferred neighbours (represented above by dotted lines).
+![methodology](readme_images/methodology.png)We retrieve the data concerning this neighbourhood from Grakn (diagram above). This information includes the *type hierarchy*, *roles*, and *attribute value* of each neighbouring Thing encountered, and any inferred neighbours (represented above by dotted lines). This data is compiled into arrays to be ingested by a neural network.
 
-Via operations Aggregate and Combine, a single vector representation is built for a Thing. This process can be chained recursively over k-hops of neighbouring Things. This builds a representation for a Thing of interest that contains information extracted from a wide context.
+Via operations Aggregate and Combine, a single vector representation is built for a Thing. This process can be chained recursively over *K* hops of neighbouring Things. This builds a representation for a Thing of interest that contains information extracted from a wide context.
 
 ![chaining](readme_images/chaining.png)
 
@@ -104,7 +111,7 @@ In order to feed a TensorFlow neural network, we need regular array structures o
 
 - Id
 - Type
-- Meta-Type (either Entity or Relationship or Attribute)
+- Meta-Type (either Entity or Relation or Attribute)
 - Data-type (if it's an attribute)
 - Value (if it's an attribute)
 - The Role that connects the example to that neighbour

diff --git a/kglib/kgcn/core/ingest/encode/encode.py b/kglib/kgcn/core/ingest/encode/encode.py
@@ -83,7 +83,7 @@ def __init__(self, schema_tx):
                 "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/1", 128)
 
             data_types = list(neighbour.DATA_TYPE_NAMES)
-            data_types.insert(0, NO_DATA_TYPE)  # For the case where an entity or relationship is encountered
+            data_types.insert(0, NO_DATA_TYPE)  # For the case where an entity or relation is encountered
             data_types_traversal = {data_type: data_types for data_type in data_types}
 
             # Later a hierarchy could be added to data_type meaning. e.g. long and double are both numeric

diff --git a/kglib/kgcn/core/ingest/traverse/data/context/builder.py b/kglib/kgcn/core/ingest/traverse/data/context/builder.py
@@ -58,7 +58,7 @@ def _traverse_from_thing(self, starting_thing: neighbour.Thing, depth: int, tx):
         sampler = self._depth_samplers[-depth]
         next_depth = depth - 1
 
-        # Any concept could play a role in a relationship if the schema permits it
+        # Any concept could play a role in a relation if the schema permits it
         # Distinguish the concepts found as roles-played
         connections = self._neighbour_finder.find(starting_thing.id, tx)
 

diff --git a/kglib/kgcn/core/ingest/traverse/data/context/builder_mocks.py b/kglib/kgcn/core/ingest/traverse/data/context/builder_mocks.py
@@ -31,15 +31,15 @@ def mock_traversal_output():
         neighbour.Thing("0", "person", "entity"),
         [
             builder.Neighbour("employee", neighbour.TARGET_PLAYS, builder.ThingContext(
-                neighbour.Thing("1", "employment", "relationship"),
+                neighbour.Thing("1", "employment", "relation"),
                 [
                     builder.Neighbour("employer", neighbour.NEIGHBOUR_PLAYS, builder.ThingContext(
                         neighbour.Thing("2", "company", "entity"), []
                     )),
                 ]
             )),
             builder.Neighbour("@has-name-owner", neighbour.TARGET_PLAYS, builder.ThingContext(
-                neighbour.Thing("3", "@has-name", "relationship"),
+                neighbour.Thing("3", "@has-name", "relation"),
                 [
                     builder.Neighbour("@has-name-value", neighbour.NEIGHBOUR_PLAYS, builder.ThingContext(
                         neighbour.Thing("4", "name", "attribute", data_type='string', value="Employee Name"),
@@ -67,8 +67,8 @@ def find(self, thing_id, tx):
 
             role_direction = neighbour.TARGET_PLAYS
             yield from gen([
-                _build_data("employee", role_direction, "1", "employment", "relationship"),
-                _build_data("@has-name-owner", role_direction, "3", "@has-name", "relationship")
+                _build_data("employee", role_direction, "1", "employment", "relation"),
+                _build_data("@has-name-owner", role_direction, "3", "@has-name", "relation")
             ])
 
         elif thing_id == "1":