From f7c186027175edd4e90d59d936e81d3edbd1e15d Mon Sep 17 00:00:00 2001
From: "David W.H. Swenson" <david.swenson@omsf.io>
Date: Mon, 24 Jul 2023 09:58:11 -0500
Subject: [PATCH 01/10] rough draft refactor gather

---
 openfecli/commands/gather.py | 221 ++++++++++++++++++++---------------
 1 file changed, 129 insertions(+), 92 deletions(-)

diff --git a/openfecli/commands/gather.py b/openfecli/commands/gather.py
index 225226eba..f446cfd70 100644
--- a/openfecli/commands/gather.py
+++ b/openfecli/commands/gather.py
@@ -51,6 +51,116 @@ def legacy_get_type(res_fn):
         return 'complex'
 
 
+def _get_ddgs(legs):
+    DDGs = []
+    for ligpair, vals in sorted(legs.items()):
+        DDGbind = None
+        DDGhyd = None
+        bind_unc = None
+        hyd_unc = None
+
+        if 'complex' in vals and 'solvent' in vals:
+            DG1_mag, DG1_unc = vals['complex']
+            DG2_mag, DG2_unc = vals['solvent']
+            if not ((DG1_mag is None) or (DG2_mag is None)):
+                # DDG(2,1)bind = DG(1->2)complex - DG(1->2)solvent
+                DDGbind = (DG1_mag - DG2_mag).m
+                bind_unc = np.sqrt(np.sum(np.square([DG1_unc.m, DG2_unc.m])))
+        elif 'solvent' in vals and 'vacuum' in vals:
+            DG1_mag, DG1_unc = vals['solvent']
+            DG2_mag, DG2_unc = vals['vacuum']
+            if not ((DG1_mag is None) or (DG2_mag is None)):
+                DDGhyd = (DG1_mag - DG2_mag).m
+                hyd_unc = np.sqrt(np.sum(np.square([DG1_unc.m, DG2_unc.m])))
+        else:  # -no-cov-
+            raise RuntimeError(f"Unknown DDG type for {vals}")
+
+        DDGs.append((*ligpair, DDGbind, bind_unc, DDGhyd, hyd_unc))
+
+    return DDGs
+
+def _write_ddg(legs, output):
+    DDGs = _get_ddgs(legs)
+    for ligA, ligB, DDGbind, bind_unc, DDGhyd, hyd_unc in DDGs:
+        name = f"{ligB}, {ligA}"
+        if DDGbind is not None:
+            DDGbind, bind_unc = dp2(DDGbind), dp2(bind_unc)
+            output.write(f'DDGbind({name})\tRBFE\t{ligA}\t{ligB}'
+                         f'\t{DDGbind}\t{bind_unc}\n')
+        if DDGhyd is not None:
+            DDGhyd, hyd_unc = dp2(DDGhyd), dp2(hyd_unc)
+            output.write(f'DDGhyd({name})\tRHFE\t{ligA}\t{ligB}\t'
+                         f'{DDGhyd}\t{hyd_unc}\n')
+
+
+    ...
+
+def _write_raw_dg(legs, output):
+    for ligpair, vals in sorted(legs.items()):
+        name = ', '.join(ligpair)
+        for simtype, (m, u) in sorted(vals.items()):
+            if m is None:
+                m, u = 'NaN', 'NaN'
+            else:
+                m, u = dp2(m.m), dp2(u.m)
+            output.write(f'DG{simtype}({name})\t{simtype}\t{ligpair[0]}\t'
+                         f'{ligpair[1]}\t{m}\t{u}\n')
+
+def _write_dg_mle(legs, output):
+    MLEs = []
+    # 4b) perform MLE
+    g = nx.DiGraph()
+    nm_to_idx = {}
+    DDGbind_count = 0
+    for ligA, ligB, DDGbind, bind_unc, DDGhyd, hyd_unc in DDGs:
+        if DDGbind is None:
+            continue
+        DDGbind_count += 1
+
+        # tl;dr this is super paranoid, but safer for now:
+        # cinnabar seems to rely on the ordering of values within the graph
+        # to correspond to the matrix that comes out from mle()
+        # internally they also convert the ligand names to ints, which I think
+        # has a side effect of giving the graph nodes a predictable order.
+        # fwiw this code didn't affect ordering locally
+        try:
+            idA = nm_to_idx[ligA]
+        except KeyError:
+            idA = len(nm_to_idx)
+            nm_to_idx[ligA] = idA
+        try:
+            idB = nm_to_idx[ligB]
+        except KeyError:
+            idB = len(nm_to_idx)
+            nm_to_idx[ligB] = idB
+
+        g.add_edge(
+            idA, idB, calc_DDG=DDGbind, calc_dDDG=bind_unc,
+        )
+    if DDGbind_count > 2:
+        idx_to_nm = {v: k for k, v in nm_to_idx.items()}
+
+        f_i, df_i = mle(g, factor='calc_DDG')
+        df_i = np.diagonal(df_i) ** 0.5
+
+        for node, f, df in zip(g.nodes, f_i, df_i):
+            ligname = idx_to_nm[node]
+            MLEs.append((ligname, f, df))
+
+    for ligA, DG, unc_DG in MLEs:
+        DG, unc_DG = dp2(DG), dp2(unc_DG)
+        output.write(f'DGbind({ligA})\tDG(MLE)\tZero\t{ligA}\t{DG}\t{unc_DG}\n')
+
+
+def dp2(v: float) -> str:
+    # turns 0.0012345 -> '0.0012', round() would get this wrong
+    return np.format_float_positional(v, precision=2, trim='0',
+                                      fractional=False)
+
+
+
+
+
 @click.command(
     'gather',
     short_help="Gather result jsons for network of RFE results into a TSV file"
@@ -59,10 +169,20 @@ def legacy_get_type(res_fn):
                 type=click.Path(dir_okay=True, file_okay=False,
                                          path_type=pathlib.Path),
                 required=True)
+@click.option(
+    '--report',
+    type=click.Choice(['dg', 'ddg', 'leg'], case_sensitive=False),
+    default="dg", show_default=True,
+    help=(
+        "What data to report. 'dg' gives maximum-likelihood estimate of "
+        "asbolute deltaG,  'ddg' gives delta-delta-G, and 'leg' gives the "
+        "raw result of the deltaG for a leg."
+    )
+)
 @click.option('output', '-o',
               type=click.File(mode='w'),
               default='-')
-def gather(rootdir, output):
+def gather(rootdir, output, report):
     """Gather simulation result jsons of relative calculations to a tsv file
 
     Will walk ROOTDIR recursively and find all results files ending in .json
@@ -96,11 +216,6 @@ def gather(rootdir, output):
     import numpy as np
     from cinnabar.stats import mle
 
-    def dp2(v: float) -> str:
-        # turns 0.0012345 -> '0.0012', round() would get this wrong
-        return np.format_float_positional(v, precision=2, trim='0',
-                                          fractional=False)
-
     # 1) find all possible jsons
     json_fns = glob.glob(str(rootdir) + '/**/*json', recursive=True)
 
@@ -130,98 +245,20 @@ def dp2(v: float) -> str:
         legs[names][simtype] = result['estimate'], result['uncertainty']
 
     # 4a for each ligand pair, resolve legs
-    DDGs = []
-    for ligpair, vals in sorted(legs.items()):
-        DDGbind = None
-        DDGhyd = None
-        bind_unc = None
-        hyd_unc = None
-
-        if 'complex' in vals and 'solvent' in vals:
-            DG1_mag, DG1_unc = vals['complex']
-            DG2_mag, DG2_unc = vals['solvent']
-            if not ((DG1_mag is None) or (DG2_mag is None)):
-                # DDG(2,1)bind = DG(1->2)complex - DG(1->2)solvent
-                DDGbind = (DG1_mag - DG2_mag).m
-                bind_unc = np.sqrt(np.sum(np.square([DG1_unc.m, DG2_unc.m])))
-        if 'solvent' in vals and 'vacuum' in vals:
-            DG1_mag, DG1_unc = vals['solvent']
-            DG2_mag, DG2_unc = vals['vacuum']
-            if not ((DG1_mag is None) or (DG2_mag is None)):
-                DDGhyd = (DG1_mag - DG2_mag).m
-                hyd_unc = np.sqrt(np.sum(np.square([DG1_unc.m, DG2_unc.m])))
-
-        DDGs.append((*ligpair, DDGbind, bind_unc, DDGhyd, hyd_unc))
-
-    MLEs = []
-    # 4b) perform MLE
-    g = nx.DiGraph()
-    nm_to_idx = {}
-    DDGbind_count = 0
-    for ligA, ligB, DDGbind, bind_unc, DDGhyd, hyd_unc in DDGs:
-        if DDGbind is None:
-            continue
-        DDGbind_count += 1
-
-        # tl;dr this is super paranoid, but safer for now:
-        # cinnabar seems to rely on the ordering of values within the graph
-        # to correspond to the matrix that comes out from mle()
-        # internally they also convert the ligand names to ints, which I think
-        # has a side effect of giving the graph nodes a predictable order.
-        # fwiw this code didn't affect ordering locally
-        try:
-            idA = nm_to_idx[ligA]
-        except KeyError:
-            idA = len(nm_to_idx)
-            nm_to_idx[ligA] = idA
-        try:
-            idB = nm_to_idx[ligB]
-        except KeyError:
-            idB = len(nm_to_idx)
-            nm_to_idx[ligB] = idB
-
-        g.add_edge(
-            idA, idB, calc_DDG=DDGbind, calc_dDDG=bind_unc,
-        )
-    if DDGbind_count > 2:
-        idx_to_nm = {v: k for k, v in nm_to_idx.items()}
-
-        f_i, df_i = mle(g, factor='calc_DDG')
-        df_i = np.diagonal(df_i) ** 0.5
-
-        for node, f, df in zip(g.nodes, f_i, df_i):
-            ligname = idx_to_nm[node]
-            MLEs.append((ligname, f, df))
+    DDGs = _get_ddgs(legs)
 
     output.write('measurement\ttype\tligand_i\tligand_j\testimate (kcal/mol)'
                  '\tuncertainty (kcal/mol)\n')
-    # 5a) write out MLE values
-    for ligA, DG, unc_DG in MLEs:
-        DG, unc_DG = dp2(DG), dp2(unc_DG)
-        output.write(f'DGbind({ligA})\tDG(MLE)\tZero\t{ligA}\t{DG}\t{unc_DG}\n')
 
+    # 5a) write out MLE values
     # 5b) write out DDG values
-    for ligA, ligB, DDGbind, bind_unc, DDGhyd, hyd_unc in DDGs:
-        name = f"{ligB}, {ligA}"
-        if DDGbind is not None:
-            DDGbind, bind_unc = dp2(DDGbind), dp2(bind_unc)
-            output.write(f'DDGbind({name})\tRBFE\t{ligA}\t{ligB}'
-                         f'\t{DDGbind}\t{bind_unc}\n')
-        if DDGhyd is not None:
-            DDGhyd, hyd_unc = dp2(DDGhyd), dp2(hyd_unc)
-            output.write(f'DDGhyd({name})\tRHFE\t{ligA}\t{ligB}\t'
-                         f'{DDGhyd}\t{hyd_unc}\n')
-
     # 5c) write out each leg
-    for ligpair, vals in sorted(legs.items()):
-        name = ', '.join(ligpair)
-        for simtype, (m, u) in sorted(vals.items()):
-            if m is None:
-                m, u = 'NaN', 'NaN'
-            else:
-                m, u = dp2(m.m), dp2(u.m)
-            output.write(f'DG{simtype}({name})\t{simtype}\t{ligpair[0]}\t'
-                         f'{ligpair[1]}\t{m}\t{u}\n')
+    writing_func = {
+        'dg': _write_dg_mle,
+        'ddg': _write_ddg,
+        'leg': _write_raw_dg,
+    }[report.lower()]
+    writing_func(legs, output)
 
 
 PLUGIN = OFECommandPlugin(

From c368ca9bf7886e13235f4b6604cafd7f87b1fdcb Mon Sep 17 00:00:00 2001
From: "David W.H. Swenson" <david.swenson@omsf.io>
Date: Mon, 24 Jul 2023 10:10:17 -0500
Subject: [PATCH 02/10] version that seems to work

now to update to make output cleaner
---
 openfecli/commands/gather.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/openfecli/commands/gather.py b/openfecli/commands/gather.py
index f446cfd70..9bb5d5edb 100644
--- a/openfecli/commands/gather.py
+++ b/openfecli/commands/gather.py
@@ -52,6 +52,7 @@ def legacy_get_type(res_fn):
 
 
 def _get_ddgs(legs):
+    import numpy as np
     DDGs = []
     for ligpair, vals in sorted(legs.items()):
         DDGbind = None
@@ -107,6 +108,10 @@ def _write_raw_dg(legs, output):
                          f'{ligpair[1]}\t{m}\t{u}\n')
 
 def _write_dg_mle(legs, output):
+    import networkx as nx
+    import numpy as np
+    from cinnabar.stats import mle
+    DDGs = _get_ddgs(legs)
     MLEs = []
     # 4b) perform MLE
     g = nx.DiGraph()
@@ -154,6 +159,7 @@ def _write_dg_mle(legs, output):
 
 def dp2(v: float) -> str:
     # turns 0.0012345 -> '0.0012', round() would get this wrong
+    import numpy as np
     return np.format_float_positional(v, precision=2, trim='0',
                                       fractional=False)
 
@@ -212,9 +218,6 @@ def gather(rootdir, output, report):
     """
     from collections import defaultdict
     import glob
-    import networkx as nx
-    import numpy as np
-    from cinnabar.stats import mle
 
     # 1) find all possible jsons
     json_fns = glob.glob(str(rootdir) + '/**/*json', recursive=True)

From 7d6887af25f15e18e1f9c0f3b52942b61228519b Mon Sep 17 00:00:00 2001
From: "David W.H. Swenson" <david.swenson@omsf.io>
Date: Mon, 24 Jul 2023 14:01:37 -0500
Subject: [PATCH 03/10] improve estimate/uncertainty rounding/reporting

---
 openfecli/commands/gather.py | 44 +++++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/openfecli/commands/gather.py b/openfecli/commands/gather.py
index 9bb5d5edb..c98a9f1e9 100644
--- a/openfecli/commands/gather.py
+++ b/openfecli/commands/gather.py
@@ -5,6 +5,39 @@
 from openfecli import OFECommandPlugin
 import pathlib
 
+from typing import Tuple
+
+def _get_column(val):
+    import numpy as np
+    if val == 0:
+        return 0
+
+    log10 = np.log10(val)
+
+    if log10 >= 0.0:
+        col = np.floor(log10 + 1)
+    else:
+        col = np.floor(log10)
+    return int(col)
+
+def format_estimate_uncertainty(
+    est: float,
+    unc: float,
+    unc_prec: int = 2,
+) -> Tuple[str, str]:
+    import numpy as np
+    # get the last column needed for uncertainty
+    unc_col = _get_column(unc) - (unc_prec - 1)
+
+    if unc_col < 0:
+        est_str = f"{est:.{-unc_col}f}"
+        unc_str = f"{unc:.{-unc_col}f}"
+    else:
+        est_str = f"{np.round(est, -unc_col + 1)}"
+        unc_str = f"{np.round(unc, -unc_col + 1)}"
+
+    return est_str, unc_str
+
 
 def is_results_json(f):
     # sanity check on files before we try and deserialize
@@ -85,7 +118,9 @@ def _write_ddg(legs, output):
     for ligA, ligB, DDGbind, bind_unc, DDGhyd, hyd_unc in DDGs:
         name = f"{ligB}, {ligA}"
         if DDGbind is not None:
-            DDGbind, bind_unc = dp2(DDGbind), dp2(bind_unc)
+            DDGbind, bind_unc = format_estimate_uncertainty(DDGbind,
+                                                            bind_unc)
+            # DDGbind, bind_unc = dp2(DDGbind), dp2(bind_unc)
             output.write(f'DDGbind({name})\tRBFE\t{ligA}\t{ligB}'
                          f'\t{DDGbind}\t{bind_unc}\n')
         if DDGhyd is not None:
@@ -103,7 +138,7 @@ def _write_raw_dg(legs, output):
             if m is None:
                 m, u = 'NaN', 'NaN'
             else:
-                m, u = dp2(m.m), dp2(u.m)
+                m, u = format_estimate_uncertainty(m.m, u.m)
             output.write(f'DG{simtype}({name})\t{simtype}\t{ligpair[0]}\t'
                          f'{ligpair[1]}\t{m}\t{u}\n')
 
@@ -153,7 +188,7 @@ def _write_dg_mle(legs, output):
             MLEs.append((ligname, f, df))
 
     for ligA, DG, unc_DG in MLEs:
-        DG, unc_DG = dp2(DG), dp2(unc_DG)
+        DG, unc_DG = format_estimate_uncertainty(DG, unc_DG)
         output.write(f'DGbind({ligA})\tDG(MLE)\tZero\t{ligA}\t{DG}\t{unc_DG}\n')
 
 
@@ -269,3 +304,6 @@ def gather(rootdir, output, report):
     section='Quickrun Executor',
     requires_ofe=(0, 6),
 )
+
+if __name__ == "__main__":
+    gather()

From 5d181e5b2768071116e482e7ffeb8861280cbf81 Mon Sep 17 00:00:00 2001
From: "David W.H. Swenson" <david.swenson@omsf.io>
Date: Mon, 24 Jul 2023 14:33:53 -0500
Subject: [PATCH 04/10] update to simplify output tables

---
 openfecli/commands/gather.py | 53 +++++++++++++++---------------------
 1 file changed, 22 insertions(+), 31 deletions(-)

diff --git a/openfecli/commands/gather.py b/openfecli/commands/gather.py
index c98a9f1e9..ae05df4fa 100644
--- a/openfecli/commands/gather.py
+++ b/openfecli/commands/gather.py
@@ -113,25 +113,24 @@ def _get_ddgs(legs):
 
     return DDGs
 
-def _write_ddg(legs, output):
+def _write_ddg(legs, writer):
     DDGs = _get_ddgs(legs)
+    writer.writerow(["ligand_i", "ligand_j", "DDG(i->j) (kcal/mol)",
+                      "uncertainty (kcal/mol)"])
     for ligA, ligB, DDGbind, bind_unc, DDGhyd, hyd_unc in DDGs:
         name = f"{ligB}, {ligA}"
         if DDGbind is not None:
             DDGbind, bind_unc = format_estimate_uncertainty(DDGbind,
                                                             bind_unc)
-            # DDGbind, bind_unc = dp2(DDGbind), dp2(bind_unc)
-            output.write(f'DDGbind({name})\tRBFE\t{ligA}\t{ligB}'
-                         f'\t{DDGbind}\t{bind_unc}\n')
+            writer.writerow([ligA, ligB, DDGbind, bind_unc])
         if DDGhyd is not None:
-            DDGhyd, hyd_unc = dp2(DDGhyd), dp2(hyd_unc)
-            output.write(f'DDGhyd({name})\tRHFE\t{ligA}\t{ligB}\t'
-                         f'{DDGhyd}\t{hyd_unc}\n')
+            DDGhyd, hyd_unc = format_estimate_uncertainty(DDGbind,
+                                                          bind_unc)
+            writer.writerow([ligA, ligB, DDGhyd, hyd_unc])
 
-
-    ...
-
-def _write_raw_dg(legs, output):
+def _write_raw_dg(legs, writer):
+    writer.writerow(["leg", "ligand_i", "ligand_j", "DG(i->j) (kcal/mol)",
+                     "uncertainty (kcal/mol)"])
     for ligpair, vals in sorted(legs.items()):
         name = ', '.join(ligpair)
         for simtype, (m, u) in sorted(vals.items()):
@@ -139,10 +138,9 @@ def _write_raw_dg(legs, output):
                 m, u = 'NaN', 'NaN'
             else:
                 m, u = format_estimate_uncertainty(m.m, u.m)
-            output.write(f'DG{simtype}({name})\t{simtype}\t{ligpair[0]}\t'
-                         f'{ligpair[1]}\t{m}\t{u}\n')
+            writer.writerow([simtype, *ligpair, m, u])
 
-def _write_dg_mle(legs, output):
+def _write_dg_mle(legs, writer):
     import networkx as nx
     import numpy as np
     from cinnabar.stats import mle
@@ -187,19 +185,11 @@ def _write_dg_mle(legs, output):
             ligname = idx_to_nm[node]
             MLEs.append((ligname, f, df))
 
+    writer.writerow(["ligand", "DG(MLE) (kcal/mol)",
+                     "uncertainty (kcal/mol)"])
     for ligA, DG, unc_DG in MLEs:
         DG, unc_DG = format_estimate_uncertainty(DG, unc_DG)
-        output.write(f'DGbind({ligA})\tDG(MLE)\tZero\t{ligA}\t{DG}\t{unc_DG}\n')
-
-
-def dp2(v: float) -> str:
-    # turns 0.0012345 -> '0.0012', round() would get this wrong
-    import numpy as np
-    return np.format_float_positional(v, precision=2, trim='0',
-                                      fractional=False)
-
-
-
+        writer.writerow([ligA, DG, unc_DG])
 
 
 @click.command(
@@ -253,6 +243,7 @@ def gather(rootdir, output, report):
     """
     from collections import defaultdict
     import glob
+    import csv
 
     # 1) find all possible jsons
     json_fns = glob.glob(str(rootdir) + '/**/*json', recursive=True)
@@ -282,11 +273,11 @@ def gather(rootdir, output, report):
 
         legs[names][simtype] = result['estimate'], result['uncertainty']
 
-    # 4a for each ligand pair, resolve legs
-    DDGs = _get_ddgs(legs)
-
-    output.write('measurement\ttype\tligand_i\tligand_j\testimate (kcal/mol)'
-                 '\tuncertainty (kcal/mol)\n')
+    writer = csv.writer(
+        output,
+        delimiter="\t",
+        lineterminator="\n",  # to exactly reproduce previous, prefer "\r\n"
+    )
 
     # 5a) write out MLE values
     # 5b) write out DDG values
@@ -296,7 +287,7 @@ def gather(rootdir, output, report):
         'ddg': _write_ddg,
         'leg': _write_raw_dg,
     }[report.lower()]
-    writing_func(legs, output)
+    writing_func(legs, writer)
 
 
 PLUGIN = OFECommandPlugin(

From b3591f2c0ebff6550ae4862f203e92487d0024a1 Mon Sep 17 00:00:00 2001
From: "David W.H. Swenson" <david.swenson@omsf.io>
Date: Thu, 27 Jul 2023 14:17:08 -0500
Subject: [PATCH 05/10] use 1 digit in uncertainty; `leg` => `legs`

---
 openfecli/commands/gather.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/openfecli/commands/gather.py b/openfecli/commands/gather.py
index ae05df4fa..cfff0921a 100644
--- a/openfecli/commands/gather.py
+++ b/openfecli/commands/gather.py
@@ -23,7 +23,7 @@ def _get_column(val):
 def format_estimate_uncertainty(
     est: float,
     unc: float,
-    unc_prec: int = 2,
+    unc_prec: int = 1,
 ) -> Tuple[str, str]:
     import numpy as np
     # get the last column needed for uncertainty
@@ -202,7 +202,7 @@ def _write_dg_mle(legs, writer):
                 required=True)
 @click.option(
     '--report',
-    type=click.Choice(['dg', 'ddg', 'leg'], case_sensitive=False),
+    type=click.Choice(['dg', 'ddg', 'legs'], case_sensitive=False),
     default="dg", show_default=True,
     help=(
         "What data to report. 'dg' gives maximum-likelihood estimate of "
@@ -285,7 +285,7 @@ def gather(rootdir, output, report):
     writing_func = {
         'dg': _write_dg_mle,
         'ddg': _write_ddg,
-        'leg': _write_raw_dg,
+        'legs': _write_raw_dg,
     }[report.lower()]
     writing_func(legs, writer)
 

From 515ad62a451f3ede85113ebbe4563d5ad0266e2f Mon Sep 17 00:00:00 2001
From: "David W.H. Swenson" <david.swenson@omsf.io>
Date: Fri, 28 Jul 2023 13:18:20 -0500
Subject: [PATCH 06/10] Apply suggestions from code review

Co-authored-by: Irfan Alibay <IAlibay@users.noreply.github.com>
---
 openfecli/commands/gather.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/openfecli/commands/gather.py b/openfecli/commands/gather.py
index cfff0921a..c8bcfb322 100644
--- a/openfecli/commands/gather.py
+++ b/openfecli/commands/gather.py
@@ -20,6 +20,7 @@ def _get_column(val):
         col = np.floor(log10)
     return int(col)
 
+
 def format_estimate_uncertainty(
     est: float,
     unc: float,
@@ -113,6 +114,7 @@ def _get_ddgs(legs):
 
     return DDGs
 
+
 def _write_ddg(legs, writer):
     DDGs = _get_ddgs(legs)
     writer.writerow(["ligand_i", "ligand_j", "DDG(i->j) (kcal/mol)",
@@ -128,6 +130,7 @@ def _write_ddg(legs, writer):
                                                           bind_unc)
             writer.writerow([ligA, ligB, DDGhyd, hyd_unc])
 
+
 def _write_raw_dg(legs, writer):
     writer.writerow(["leg", "ligand_i", "ligand_j", "DG(i->j) (kcal/mol)",
                      "uncertainty (kcal/mol)"])
@@ -140,6 +143,7 @@ def _write_raw_dg(legs, writer):
                 m, u = format_estimate_uncertainty(m.m, u.m)
             writer.writerow([simtype, *ligpair, m, u])
 
+
 def _write_dg_mle(legs, writer):
     import networkx as nx
     import numpy as np

From 16b2a344fd44b8e360c5ee73d00e7625fd088ef8 Mon Sep 17 00:00:00 2001
From: "David W.H. Swenson" <david.swenson@omsf.io>
Date: Mon, 31 Jul 2023 10:58:56 -0500
Subject: [PATCH 07/10] Tuple => tuple

---
 openfecli/commands/gather.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/openfecli/commands/gather.py b/openfecli/commands/gather.py
index cfff0921a..af7501600 100644
--- a/openfecli/commands/gather.py
+++ b/openfecli/commands/gather.py
@@ -5,7 +5,6 @@
 from openfecli import OFECommandPlugin
 import pathlib
 
-from typing import Tuple
 
 def _get_column(val):
     import numpy as np
@@ -24,7 +23,7 @@ def format_estimate_uncertainty(
     est: float,
     unc: float,
     unc_prec: int = 1,
-) -> Tuple[str, str]:
+) -> tuple[str, str]:
     import numpy as np
     # get the last column needed for uncertainty
     unc_col = _get_column(unc) - (unc_prec - 1)

From cc2fecae95fe9e853e63985b5bde5b2cb2f7188d Mon Sep 17 00:00:00 2001
From: "David W.H. Swenson" <david.swenson@omsf.io>
Date: Tue, 1 Aug 2023 11:27:41 -0500
Subject: [PATCH 08/10] add tests for uncertainty calc stuff

---
 openfecli/tests/commands/test_gather.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/openfecli/tests/commands/test_gather.py b/openfecli/tests/commands/test_gather.py
index afafc4e29..68f7b6475 100644
--- a/openfecli/tests/commands/test_gather.py
+++ b/openfecli/tests/commands/test_gather.py
@@ -4,7 +4,24 @@
 import tarfile
 import pytest
 
-from openfecli.commands.gather import gather
+from openfecli.commands.gather import (
+    gather, format_estimate_uncertainty, _get_column
+)
+
+@pytest.mark.parametrize('est,unc,unc_prec,est_str,unc_str', [
+    (12.432, 0.111, 2, "12.43", "0.11"),
+    (0.9999, 0.01, 2, "1.000", "0.010"),
+    (1234, 100, 2, "1230", "100"),
+])
+def test_format_estimate_uncertainty(est, unc, unc_prec, est_str, unc_str):
+    assert format_estimate_uncertainty(est, unc, unc_prec) == (est_str, unc_str)
+
+@pytest.mark.parametrize('val, col', [
+    (1.0, 1), (0.1, -1), (-0.0, 0), (0.0, 0), (0.2, -1), (0.9, -1),
+    (0.011, -2), (9, 1), (10, 2), (15, 2),
+])
+def test_get_column(val, col):
+    assert _get_column(val) == col
 
 
 @pytest.fixture

From bf0661a21431c2de67e8229790a045bdfedf98a5 Mon Sep 17 00:00:00 2001
From: "David W.H. Swenson" <david.swenson@omsf.io>
Date: Tue, 1 Aug 2023 12:13:44 -0500
Subject: [PATCH 09/10] Add HyphenAwareChoice

---
 openfecli/clicktypes/__init__.py              |  1 +
 openfecli/clicktypes/hyphenchoice.py          | 13 +++++++++++++
 openfecli/commands/gather.py                  |  3 ++-
 .../tests/clicktypes/test_hyphenchoice.py     | 19 +++++++++++++++++++
 4 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 openfecli/clicktypes/__init__.py
 create mode 100644 openfecli/clicktypes/hyphenchoice.py
 create mode 100644 openfecli/tests/clicktypes/test_hyphenchoice.py

diff --git a/openfecli/clicktypes/__init__.py b/openfecli/clicktypes/__init__.py
new file mode 100644
index 000000000..4d9120e0e
--- /dev/null
+++ b/openfecli/clicktypes/__init__.py
@@ -0,0 +1 @@
+from hyphenchoice import HyphenAwareChoice
diff --git a/openfecli/clicktypes/hyphenchoice.py b/openfecli/clicktypes/hyphenchoice.py
new file mode 100644
index 000000000..70ac77501
--- /dev/null
+++ b/openfecli/clicktypes/hyphenchoice.py
@@ -0,0 +1,13 @@
+import click
+
+def _normalize_to_hyphen(string):
+    return string.replace("_", "-")
+
+class HyphenAwareChoice(click.Choice):
+    def __init__(self, choices, case_sensitive=True):
+        choices = [_normalize_to_hyphen(choice) for choice in choices]
+        super().__init__(choices, case_sensitive)
+
+    def convert(self, value, param, ctx):
+        value = _normalize_to_hyphen(value)
+        return super().convert(value, param, ctx)
diff --git a/openfecli/commands/gather.py b/openfecli/commands/gather.py
index 6633783ac..ac3ee9ce7 100644
--- a/openfecli/commands/gather.py
+++ b/openfecli/commands/gather.py
@@ -3,6 +3,7 @@
 
 import click
 from openfecli import OFECommandPlugin
+from openfecli.clicktypes import HyphenAwareChoice
 import pathlib
 
 
@@ -205,7 +206,7 @@ def _write_dg_mle(legs, writer):
                 required=True)
 @click.option(
     '--report',
-    type=click.Choice(['dg', 'ddg', 'legs'], case_sensitive=False),
+    type=HyphenAwareChoice(['dg', 'ddg', 'dg-raw'], case_sensitive=False),
     default="dg", show_default=True,
     help=(
         "What data to report. 'dg' gives maximum-likelihood estimate of "
diff --git a/openfecli/tests/clicktypes/test_hyphenchoice.py b/openfecli/tests/clicktypes/test_hyphenchoice.py
new file mode 100644
index 000000000..c7c0bb5a2
--- /dev/null
+++ b/openfecli/tests/clicktypes/test_hyphenchoice.py
@@ -0,0 +1,19 @@
+import pytest
+import click
+from openfecli.clicktypes.hyphenchoice import HyphenAwareChoice
+
+class TestHyphenAwareChoice:
+    @pytest.mark.parametrize('value', [
+        "foo_bar_baz", "foo_bar-baz", "foo-bar_baz", "foo-bar-baz"
+    ])
+    def test_init(self, value):
+        ch = HyphenAwareChoice([value])
+        assert ch.choices == ["foo-bar-baz"]
+
+    @pytest.mark.parametrize('value', [
+        "foo_bar_baz", "foo_bar-baz", "foo-bar_baz", "foo-bar-baz"
+    ])
+    def test_convert(self, value):
+        ch = HyphenAwareChoice(['foo-bar-baz'])
+        # counting on __call__ to get to convert()
+        assert ch(value) == "foo-bar-baz"

From d13e17530710d02a1b0ed8598fe464fffeb5a6db Mon Sep 17 00:00:00 2001
From: "David W.H. Swenson" <david.swenson@omsf.io>
Date: Tue, 1 Aug 2023 13:25:41 -0500
Subject: [PATCH 10/10] clean up a bit of gather

---
 openfecli/clicktypes/__init__.py |  2 +-
 openfecli/commands/gather.py     | 44 ++++++++++++++------------------
 2 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/openfecli/clicktypes/__init__.py b/openfecli/clicktypes/__init__.py
index 4d9120e0e..29b5b1291 100644
--- a/openfecli/clicktypes/__init__.py
+++ b/openfecli/clicktypes/__init__.py
@@ -1 +1 @@
-from hyphenchoice import HyphenAwareChoice
+from .hyphenchoice import HyphenAwareChoice
diff --git a/openfecli/commands/gather.py b/openfecli/commands/gather.py
index ac3ee9ce7..7b9e466e0 100644
--- a/openfecli/commands/gather.py
+++ b/openfecli/commands/gather.py
@@ -131,7 +131,7 @@ def _write_ddg(legs, writer):
             writer.writerow([ligA, ligB, DDGhyd, hyd_unc])
 
 
-def _write_raw_dg(legs, writer):
+def _write_dg_raw(legs, writer):
     writer.writerow(["leg", "ligand_i", "ligand_j", "DG(i->j) (kcal/mol)",
                      "uncertainty (kcal/mol)"])
     for ligpair, vals in sorted(legs.items()):
@@ -210,8 +210,8 @@ def _write_dg_mle(legs, writer):
     default="dg", show_default=True,
     help=(
         "What data to report. 'dg' gives maximum-likelihood estimate of "
-        "asbolute deltaG,  'ddg' gives delta-delta-G, and 'leg' gives the "
-        "raw result of the deltaG for a leg."
+        "absolute deltaG,  'ddg' gives delta-delta-G, and 'dg-raw' gives "
+        "the raw result of the deltaG for a leg."
     )
 )
 @click.option('output', '-o',
@@ -220,30 +220,24 @@ def _write_dg_mle(legs, writer):
 def gather(rootdir, output, report):
     """Gather simulation result jsons of relative calculations to a tsv file
 
-    Will walk ROOTDIR recursively and find all results files ending in .json
-    (i.e those produced by the quickrun command).  Each of these contains the
-    results of a separate leg from a relative free energy thermodynamic cycle.
+    This walks ROOTDIR recursively and finds all result JSON files from the
+    quickrun command (these files must end in .json). Each of these contains
+    the results of a separate leg from a relative free energy thermodynamic
+    cycle.
 
-    Paired legs of simulations will be combined to give the DDG values between
-    two ligands in the corresponding phase, producing either binding ('DDGbind')
-    or hydration ('DDGhyd') relative free energies.  These will be reported as
-    'DDGbind(B,A)' meaning DGbind(B) - DGbind(A), the difference in free energy
-    of binding for ligand B relative to ligand A.
-
-    Individual leg results will be also be written.  These are reported as
-    either DGvacuum(A,B) DGsolvent(A,B) or DGcomplex(A,B) for the vacuum,
-    solvent or complex free energy of transmuting ligand A to ligand B.
+    The results reported depend on ``--report`` flag:
 
     \b
-    Will produce a **tab** separated file with 6 columns:
-    1) a description of the measurement, for example DDGhyd(A, B)
-    2) the type of this measurement, either RBFE or RHFE
-    3) the identifier of the first ligand
-    4) the identifier of the second ligand
-    5) the estimated value (in kcal/mol)
-    6) the uncertainty on the value (also kcal/mol)
-
-    By default, outputs to stdout, use -o option to choose file.
+    * 'dg' (default) reports the ligand and the results are the maximum
+      likelihood estimate of its absolute free, and the uncertainty in
+      that.
+    * 'ddg' reports pairs of ligand_i and ligand_j, the calculated
+      relative free energy DDG(i->j) = DG(j) - DG(i) and its uncertainty
+    * 'dg-raw' reports the raw results, giving the leg (vacuum, solvent, or
+      complex), ligand_i, ligand_j, the raw DG(i->j) associated with it.
+
+    The output is a table of **tab** separated values. By default, this
+    outputs to stdout, use the -o option to choose an output file.
     """
     from collections import defaultdict
     import glob
@@ -289,7 +283,7 @@ def gather(rootdir, output, report):
     writing_func = {
         'dg': _write_dg_mle,
         'ddg': _write_ddg,
-        'legs': _write_raw_dg,
+        'dg-raw': _write_dg_raw,
     }[report.lower()]
     writing_func(legs, writer)