Merge pull request #62 from jsbrittain/main

Add additional settings and fix validation checks
kraemer-lab · Jul 30, 2023 · 3130e4e · 3130e4e
2 parents 3834d59 + a228165
commit 3130e4e
Show file tree

Hide file tree

Showing 29 changed files with 1,230 additions and 226 deletions.
diff --git a/builder/builder/builder.py b/builder/builder/builder.py
@@ -1,4 +1,5 @@
 import argparse
+import copy
 import json
 import pathlib
 import re
@@ -84,7 +85,7 @@ def _GetConfigFileinfo(self) -> Union[str, dict]:
             return filename
         if isinstance(self.snakefile, dict):
             # Remote file
-            c = self.snakefile.copy()
+            c = copy.deepcopy(self.snakefile)
             c["kwargs"]["path"] = c["kwargs"]["path"].replace(
                 workflow_filename, config_filename
             )
@@ -126,6 +127,11 @@ class Model:
     def __init__(self) -> None:
         """Initialise the model"""
         self.nodes: List[Node] = []  # List of Node objects
+        self.partial_build: bool = False
+
+    def SetPartialBuild(self, partial_build: bool) -> None:
+        """Sets the partial build flag (does not throw if a node is missing)"""
+        self.partial_build = partial_build
 
     def BuildSnakefile(
         self,
@@ -292,13 +298,19 @@ def AddConnector(self, name, connector) -> None:
         Example: Connect the output of module2 to the named input on module1
             connector = [ {"input1": "module2"}, "module1" ]
 
+        Error behaviour depends on the partial_build flag. If False then an
+        error is thrown if a node is not found. If True then the connector is
+        ignored and None returned.
+
         Args:
             name (str): Name of the connector
             connector (list): Connector definition
         """
         mapping = connector.get("map", None)
         node_to = self.GetNodeByName(mapping[1])
         if not node_to:
+            if self.partial_build:
+                return
             raise ValueError(
                 "No matching node found for connector source: "
                 "Requested '" + mapping[1] + "'"
@@ -310,13 +322,17 @@ def AddConnector(self, name, connector) -> None:
             for k, v in mapping[0].items():
                 incoming_node = self.GetNodeByName(v)
                 if not incoming_node:
+                    if self.partial_build:
+                        return
                     raise ValueError(
                         "No matching node found for connector source: " + v
                     )
                 node_to.input_namespace[k] = incoming_node.output_namespace
         else:
             node_from = self.GetNodeByName(mapping[0])
             if not node_from:
+                if self.partial_build:
+                    return
                 raise ValueError(
                     "No matching node found for connector destination: " + mapping[0]
                 )
@@ -579,7 +595,10 @@ def parse_struct(yl: dict):
                 c += f'["{key}"]={{}}\n'  # Create empty dict
                 c += "\n".join([f'["{key}"]{v}' for v in vv]) + "\n"
             elif isinstance(value, list):
-                raise Exception("Lists not supported in config")
+                c += f'["{key}"]=[]\n'  # Create empty list
+                for item in value:
+                    c += f'["{key}"].append("{item}")\n'
+                # raise Exception("Lists not supported in config")
             elif not value:
                 # Null
                 c += f'["{key}"]="None"\n'
@@ -620,8 +639,9 @@ def BuildFromJSON(
     config: dict,
     singlefile: bool = False,
     expand: bool = True,
-    build_path: str = "",
+    build_path: str = "build",
     clean_build: bool = True,
+    partial_build: bool = False,  # Don't throw an error if node is missing
 ) -> Tuple[Union[Tuple[str, str], bytes], Model]:
     """Builds a workflow from a JSON specification
 
@@ -630,6 +650,7 @@ def BuildFromJSON(
     With singlefile=False the workflow is a (zipped) directory structure.
     """
     m = Model()
+    m.SetPartialBuild(partial_build)
     # Add modules first to ensure all namespaces are defined before connectors
     for item in config:
         if item["type"].casefold() in ["module", "source", "terminal"]:

diff --git a/builder/builder/builder_web.py b/builder/builder/builder_web.py
@@ -59,10 +59,12 @@ def GetLocalModules(path: str) -> List[dict]:
                         config = yaml.safe_load(file)
                 except FileNotFoundError:
                     print(f"Config file not found - assuming blank: {file}")
+                module_classification = GetModuleClassification(config)
                 modules.append(
                     {
                         "name": f"({org}) {FormatName(workflow)}",
-                        "type": module_type[:-1],  # remove plural
+                        # "type": module_type[:-1],  # remove plural
+                        "type": module_classification,
                         "config": {
                             "snakefile": abspath(url_workflow),
                             "config": config,
@@ -159,10 +161,13 @@ def GetRemoteModulesGithubDirectoryListing(repo: str) -> List[dict]:
                         "Github API request failed (getting workflow config file)."
                     )
                 config = yaml.safe_load(r_config.text)
+                # Determine module type by config file, rather than directory name
+                module_classification = GetModuleClassification(config)
                 modules.append(
                     {
                         "name": f"({org['name']}) {FormatName(workflow['name'])}",
-                        "type": module_type["name"][:-1],  # remove plural
+                        # "type": module_type["name"][:-1],  # remove plural
+                        "type": module_classification,
                         "config": {
                             "snakefile": {
                                 "function": "github",
@@ -220,10 +225,12 @@ def GetRemoteModulesGithubBranchListing(repo: str) -> List[dict]:
         if r_config.status_code != 200:
             raise Exception("Github API request failed (getting workflow config file).")
         config = yaml.safe_load(r_config.text)
+        module_classification = GetModuleClassification(config)
         modules.append(
             {
                 "name": branch["name"],
-                "type": module_types[module_type],
+                # "type": module_types[module_type],
+                "type": module_classification,
                 "config": {
                     "snakefile": {
                         "function": "github",
@@ -241,6 +248,22 @@ def GetRemoteModulesGithubBranchListing(repo: str) -> List[dict]:
     return modules
 
 
+def GetModuleClassification(config: dict) -> str:
+    """Determine the module classification from the config file
+
+    Args:
+        config: module config file
+    """
+    # If config is None, then default to module
+    if config is None:
+        return "module"
+    # If the input namespace exists and is anything other than None, then it is
+    # a module
+    if config.get("input_namespace", "blank") is None:
+        return "source"
+    return "module"
+
+
 def GetWorkflowFiles(
     load_command: str,
 ) -> Tuple[str, str]:

diff --git a/docs/getting_started/builder.md b/docs/getting_started/builder.md
@@ -1,6 +1,4 @@
-# Quickstart
-
-## GRAPEVNE Builder
+# GRAPEVNE Builder
 
 GRAPEVNE Builder is the graphical interface that assists you in graphing,
 manipulating and building GRAPEVNE workflows.
@@ -12,27 +10,26 @@ run a simple workflow using several modules that are already available online.
 For this tutorial we will download, process and visualise some publicly
 available Covid-19 data.
 
-### Loading modules
+## Loading modules
 
 To begin, open GRAPEVNE. This should start the
 application in the 'Builder' screen, where you will construct and test-run
 your workflows.
 
-There are various options available at the top of the screen. 
+There are various options available at the top of the screen.
 For this tutorial we will make use of pre-constructed modules available through
 our online repository. To access the repository ensure that 'Directory
 Listing (github)' is selected from the repository drop-down box at the top of
 the screen, and that `kraemer-lab/vneyard` (note the spelling) is displayed in
-the repository field. Then, click `Get Module List`. A message (`Loading remote
-modules`) will display while the listing loads. This will take a few seconds
+the repository field. Then, click `Get Module List`. A message (`Loading remote modules`) will display while the listing loads. This will take a few seconds
 and you will know when it has completed as the list of available modules will
 display on the left-hand side of the screen.
 
 The module list is filterable - for this session we are interested in only the
 `Tutorial Builder` modules, so select this from the filter drop-down box (
 which is currently displaying `(all)`).
 
-### Adding your first module
+## Adding your first module
 
 In this tutorial we are interested in the number of new cases of Covid-19 that
 were reported to the World Health Organisation (WHO) at different times and across
@@ -50,7 +47,7 @@ appeared on the graph you can click on the node/module to edit its parameters.
 This module has been set up (by default) to download the `snakemake` logo from
 their website. However, we want to change this to download the WHO data.
 
-### Modify module parameters
+## Modify module parameters
 
 Click on the module and take a moment to look through the parameters list that
 is displayed on the right-hand side of the screen. Note that there is
@@ -60,6 +57,7 @@ which appears under `config-params-url`. Hovering your mouse over the URL text
 will change the background colour, indicating that it can be edited. Clicking
 on the text will open an edit box. Do this now, select all of the text and
 replace it with the location of the WHO data:
+
 ```
 https://covid19.who.int/WHO-COVID-19-global-data.csv
 ```
@@ -76,7 +74,7 @@ In addition to changing the URL location, we also want to rename the file
 after download for convenience. We can do this by editing the `filename` field
 (`config-params-filename`) and changing it to `data.csv`.
 
-### Connect modules together into workflows
+## Connect modules together into workflows
 
 We will now add two more nodes to the graph. First, add the `Filter` node.
 The WHO dataset contains information on many countries, but we want to limit
@@ -107,7 +105,7 @@ screen into the canvas area, and connect the `Filter` node to it. Leave its
 parameters set to the defaults (for now). Now we are ready to (test) run our
 first workflow...
 
-### Test run a workflow
+## Test run a workflow
 
 GRAPEVNE is designed to build workflows into self-contained files that can be
 shared with others. However, we don't currently want to do that. Instead, we
@@ -132,7 +130,7 @@ Try this now: click `Build and Test`. Remember to open the `Terminal` in order
 to monitor progress. This is the perfect time to grab a fresh cup of tea while
 the environment loads - we will discuss why this takes so long shortly...
 
-### Your first graph
+## Your first graph
 
 You can monitor the status of your workflow from the `Terminal`, but if all
 goes well then you should see a figure pop up onto your screen displaying
@@ -148,7 +146,7 @@ visualisation package. Visualising data is useful as a tutorial example, and
 can be very useful to periodically inspect the results of analyses, or to
 manually-inspect pre-processed inputs prior to starting long jobs.
 
-### Extending the graph
+## Extending the graph
 
 Let us now extend the analysis by considering seasonal variations. With your
 previous graph intact, add a new node `AggregateByMonth` and connect the
@@ -166,7 +164,7 @@ Your graph should now look (very similar) to this:
 :alt: Complete layout for the Builder Tutorial
 ```
 
-### Run your extended graph
+## Run your extended graph
 
 Click on `Build and Test` to run your extended graph. Since a lot of the
 set-up process was completed on the first run (although not all - we have
@@ -183,7 +181,7 @@ and after a few more seconds of processing time, the second graph will appear,
 revealing the seasonal fluctuations in new Covid-19 cases by month (numbered
 1-12 in this case) for our selected country.
 
-### Changing parameters
+## Changing parameters
 
 As you have seen already, changing the parameters of this analysis is extremely
 easy, you simply have to edit them by clicking on the relevant modules. Try this
@@ -192,12 +190,12 @@ if you are not sure of your country code). Click `Build and Test` to launch the
 analysis.
 
 ```{note}
-Snakemake will know that the parameters of the analysis have changed 
+Snakemake will know that the parameters of the analysis have changed
 and will automatically re-process the necessary steps to produce the
 required analysis.
 ```
 
-### Building workflows for use outside of GRAPEVNE
+## Building workflows for use outside of GRAPEVNE
 
 Although this extends slightly beyond the basic usage of GRAPEVNE, it is useful
 to consider two scenarios where we may want to build a workflow and distribute
@@ -210,16 +208,18 @@ run outside of GRAPEVNE.
 
 For example, to run your newly created workflow called `build.zip` outside of
 GRAPEVNE, simply unzip it, move into the `build` folder and type:
+
 ```
 snakemake --cores 1 $(snakemake --list)
 ```
+
 in the terminal (or command prompt). This will launch the same series of steps
 as you executed with the `Build and Test` button, but without needing the
 GRAPEVNE application. These can therefore be run locally, or remotely, as
 required and demonstrates another principle of GRAPEVNE: that workflows remain
 entirely compatible with modern Snakemake.
 
-### Building workflows for use as modules
+## Building workflows for use as modules
 
 The second basic scenario of workflow usage is using GRAPEVNE to create
 workflows composed of hierarchies of modules. This requires us to be able to
@@ -236,14 +236,15 @@ exposed in the editor for use.
 ## Summary
 
 You are now able to build a workflow to either
+
 1. distribute to others for use, or
 2. use as a module in another workflow
 
 Indeed, it is this form of hierarchical modularization that makes GRAPEVNE
 so powerful. However, to demonstrate this we will want to set up our own
 repository for testing (which we discuss in the next set of tutorials).
 
-### Wait, but why is the workflow so slow on a first run?
+## Wait, but why is the workflow so slow on a first run?
 
 You may be wondering why the workflow was so slow to execute on its first run.
 After all, it was simply reading a file, filtering it, and plotting the result.

diff --git a/docs/getting_started/challenge.md b/docs/getting_started/challenge.md
@@ -0,0 +1,67 @@
+# Challenge
+
+This challenge is designed to allow you to engage with GRAPEVNE using your
+preferred programming or scripting language. For example, if you are comfortable
+programming in R, then try to complete the challenge in R.
+
+Build the following workflow using GRAPEVNE modules.
+
+## Outline
+
+Create a set of modules, and then combine that set in a hierarchy / composition
+to achieve the following: download a given file and calculate the following
+statistic for each letter of the alphabet:
+"number of words beginning with each letter of the alphabet, minus the number
+of words ending with that letter". For example,
+if there were 20 words in a given file that began with the letter 'a', and 25
+that ended with the letter 'a', then the output file should contain a list with
+each letter of the alphabet, accompanied by the calculated metric, so the first
+line would read "-5" in this case. The file should contain one line for every
+letter of the alphabet.
+
+As extended exercises:
+
+1. limit the analysis to only consider words of a
+   specified length (i.e. words between, say, 4-8 letters long)
+2. plot the results as a bar graph with a separate bar for each letter of the
+   alphabet.
+
+In order to make best use of GRAPEVNE modules and hierarchies, it is recommended
+to take the following approach:
+
+### Module 1: Download a words list
+
+Create a module (that runs in a conda environment) to download a list of words.
+
+Here is a list of English words: [https://github.com/dwyl/english-words/blob/master/words_alpha.txt](https://github.com/dwyl/english-words/blob/master/words_alpha.txt) (credit to: [https://github.com/dwyl/english-words](https://github.com/dwyl/english-words)).
+
+### Module 2: Count the number of words beginning with each letter of the alphabet
+
+Create a module that takes as input a text file, and produces as output a file
+listing each letter of the alphabet along with the number of words in the input
+file that began with that letter.
+
+### Module 3: Reverse words
+
+Create a module that reverses the text on each line of an input file, and
+produces as output a file containing the reversed words.
+
+### Module 4: Subtract two numeric files from one another
+
+Create a module that takes as input _two_ files containing lists of numbers,
+and produces as output a single file containing the difference (i.e. `a-b`)
+in row-wise fashion (i.e. if file 1 contained lines `1 2 3 4 5` and file two
+contained lines `3 1 5 2 3` then the output would be `-2 1 -2 2 2`).
+
+### Extended Module 1: Filter by word length
+
+Create a module that takes a text file as input and produces another text file
+as output, where the output contains only those words that are within a
+specified word length. The parameters should be adjustable but could be, for
+example, larger than or equal to 4 letters and shorter than or equal to 8
+letters.
+
+### Extended Module 2: Bar graph
+
+Create a module that produces a bar graph given an input file consisting of a
+list of numbers.
diff --git a/.../getting_started/tutorial-5checkpoints.md → docs/getting_started/datadeps.md b/.../getting_started/tutorial-5checkpoints.md → docs/getting_started/datadeps.md