refactor chain and ucsc.net sniffing

Properly ignore comments, and do not read the dataset unbounded; I think we have enough info from the headers and the first data line.
galaxyproject · May 24, 2024 · cae34c8 · cae34c8
1 parent 7819c9a
commit cae34c8
Showing 1 changed file with 42 additions and 68 deletions.
diff --git a/lib/galaxy/datatypes/chain.py b/lib/galaxy/datatypes/chain.py
@@ -6,10 +6,7 @@
 
 from galaxy.datatypes.metadata import MetadataElement
 from galaxy.datatypes.protocols import DatasetProtocol
-from galaxy.datatypes.sniff import (
-    build_sniff_from_prefix,
-    FilePrefix,
-)
+from galaxy.datatypes.sniff import build_sniff_from_prefix, FilePrefix, get_headers
 from galaxy.util import (
     commaify,
     compression_utils,
@@ -91,41 +88,31 @@ def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
         >>> fname = get_test_fname( '1.chain' )
         >>> Chain().sniff( fname )
         True
+        >>> fname = get_test_fname( '2.chain' )
+        >>> Chain().sniff( fname )
+        True
         >>>
         """
-        fh = file_prefix.string_io()
-        for line in fh:
-            line = line.strip()
-            if line:  # first non-empty line
-                if line.startswith("chain"):
-                    tokens = line.split()
-                    if not (
-                        len(tokens) in [12, 13]
-                        and tokens[4] in self.strands
-                        and tokens[9] in self.strands
-                        and tokens[3].isdecimal()
-                        and tokens[5].isdecimal()
-                        and tokens[6].isdecimal()
-                    ):
-                        return False
-                    prior_token_len = 0
-                    for line in fh:
-                        line = line.strip()
-                        if line == "":
-                            break
-                        tokens = line.split()
-                        if prior_token_len == 1:
-                            return False
-                        if len(tokens) not in [1, 3]:
-                            return False
-                        if not all(token.isdecimal() for token in tokens):
-                            return False
-                        prior_token_len = len(tokens)
-                    if prior_token_len == 1:
-                        return True
-                else:
-                    return False
-        return False
+        headers = get_headers(file_prefix, None, count=2, comment_designator="#")
+        if not (
+            len(headers) == 2
+            and len(headers[0]) in [12, 13]
+            and headers[0][0] == "chain"
+            and headers[0][1].isdecimal()
+            and headers[0][3].isdecimal()
+            and headers[0][4] in self.strands
+            and headers[0][5].isdecimal()
+            and headers[0][6].isdecimal()
+            and headers[0][8].isdecimal()
+            and headers[0][9] in self.strands
+            and headers[0][10].isdecimal()
+            and headers[0][11].isdecimal()
+            and headers[1][0].isdecimal()
+            and len(headers[1]) in [1, 3]
+        ):
+            return False
+        else:
+            return True
 
 
 @build_sniff_from_prefix
@@ -161,34 +148,21 @@ def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
         allowed_classes = ["fill", "gap"]
         strands = ["+", "-"]
 
-        fh = file_prefix.string_io()
-        for line in fh:
-            line = line.strip()
-            if line:  # first non-empty line
-                if line.startswith("net"):
-                    tokens = line.split()
-                    if not (len(tokens) == 3 and tokens[2].isdecimal()):
-                        return False
-                    for line in fh:
-                        if line[0] != " ":  # children are indented one space
-                            return False
-                        line = line.strip()
-                        if line == "":
-                            break
-                        tokens = line.split()
-                        if not (
-                            len(tokens) >= 7  # seven fixed fields
-                            and len(tokens) <= 41  # plus seventeen optional name/value pairs
-                            and tokens[0] in allowed_classes
-                            and tokens[1].isdecimal()
-                            and tokens[2].isdecimal()
-                            and tokens[4] in strands
-                            and tokens[5].isdecimal()
-                            and tokens[6].isdecimal()
-                        ):
-                            return False
-                        else:
-                            return True
-                else:
-                    return False
-        return False
+        headers = get_headers(file_prefix, None, count=2, comment_designator="#")
+        if not (
+            len(headers) == 2
+            and len(headers[0]) == 3
+            and headers[0][0] == "net"
+            and headers[0][2].isdecimal()
+            and len(headers[1]) >= 7  # seven fixed fields
+            and len(headers[1]) <= 41  # plus seventeen optional name/value pairs
+            and headers[1][0] in allowed_classes
+            and headers[1][1].isdecimal()
+            and headers[1][2].isdecimal()
+            and headers[1][4] in strands
+            and headers[1][5].isdecimal()
+            and headers[1][6].isdecimal()
+        ):
+            return False
+        else:
+            return True