minio integration: write intermediate data to minio, instead of storing #79

Draft · wants to merge 36 commits into base: main

Changes from all commits (36 commits)
894a6bb
changes for activation to support etl
supradeep2819 Apr 17, 2024
d845bdc
Merge branch 'dev' of github.com:valmi-io/valmi-activation into dev
rajvarkala Apr 17, 2024
e06dc9e
Merge pull request #70 from valmi-io/main
supradeep2819 Apr 26, 2024
5d31e97
fix: add missing import to connector lib
gane5hvarma Apr 26, 2024
ba79b23
Merge branch 'dev' of github.com:valmi-io/valmi-activation into dev
rajvarkala Apr 28, 2024
80cd1fc
feat: add normalization and dbt step into dag (#71)
gane5hvarma Apr 30, 2024
a4999b8
feature: Added endpoint to retrieve last run status
supradeep2819 May 2, 2024
ececb3a
feature: Added endpoint for getting last run status
supradeep2819 May 2, 2024
f82d3b9
Merge pull request #72 from valmi-io/latest_run_status
supradeep2819 May 2, 2024
e06bbc1
feat: Handled missed schema for last run status
supradeep2819 May 3, 2024
d758da6
fix: retrieving last run ID in descending order
supradeep2819 May 3, 2024
a1ce573
fix: retrieving last run ID in descending order
supradeep2819 May 3, 2024
682460e
Merge pull request #73 from valmi-io/latest_run_status
supradeep2819 May 3, 2024
1b56fcb
"feat: Enhance source postgres configuration to provide specific tabl…
supradeep2819 May 10, 2024
a909e5c
Merge pull request #74 from valmi-io/latest_run_status
supradeep2819 May 10, 2024
fb0d957
Merge branch 'dev' of github.com:valmi-io/valmi-activation into dev
rajvarkala May 10, 2024
f69dcd8
feat: changes needed for destination postgres to work
gane5hvarma May 14, 2024
374bf85
feat: Adding checkpointing to etl sources and destinations
rajvarkala May 15, 2024
b22b4a2
version up
rajvarkala May 15, 2024
d307b97
feat: for etl, batching source records only on seeing state messages
rajvarkala May 15, 2024
e61e1a4
feat: generating catalog when query is present postgres config
supradeep2819 May 16, 2024
79654bb
feat: multiple state messages stored for etl sources
rajvarkala May 16, 2024
dd210cf
Merge branch 'dev' of github.com:valmi-io/valmi-activation into dev
rajvarkala May 16, 2024
172195f
Fix finalizer step which got stuck because of transformation (#75)
gane5hvarma May 17, 2024
bad286b
feat: rm print statements and update version
gane5hvarma May 20, 2024
dea7d8d
feat: Add endpoint to fetch latest successful sync
supradeep2819 May 23, 2024
5ff5f39
feat: renamed timestamp with run_end_at in latest_run_status endpoint
supradeep2819 May 23, 2024
3197240
feat: Used schemas instead of dict in sync service
supradeep2819 May 28, 2024
5db346c
Merge pull request #76 from valmi-io/feat.last_successful_sync
supradeep2819 May 28, 2024
c3d109e
bug: metrics fix for etl
rajvarkala May 29, 2024
3bd3a8a
feat: source-postgres changes to support filtered queries
supradeep2819 Jun 6, 2024
0701a1a
feat: changes for source-postgres to support filter queries
supradeep2819 Jun 6, 2024
7894aff
Merge pull request #77 from valmi-io/feat.source_postgres
supradeep2819 Jun 6, 2024
2f89aa9
minio integration: write intermediate data to minio, instead of stori…
saurav-malani Jun 17, 2024
34233fa
minio integration: write intermediate data to minio, instead of stori…
saurav-malani Jun 17, 2024
54f2ebc
removed gzip part
saurav-malani Jun 18, 2024
2 changes: 2 additions & 0 deletions config.yaml
@@ -70,3 +70,5 @@ CONNECTOR_RUN_CONFIG:
records_per_metric: 100
POSTGRES:
records_per_metric: 100
SHOPIFY:
chunk_size: 300
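
The new SHOPIFY block follows the existing per-connector layout under CONNECTOR_RUN_CONFIG. As a minimal sketch of how a wrapper might look this up — the helper and anything beyond the keys shown above are assumptions, not code from this PR:

```python
# Hypothetical lookup helper, not part of this PR; only the
# CONNECTOR_RUN_CONFIG / SHOPIFY / chunk_size keys come from config.yaml above.
import yaml

def get_run_config(connector: str, path: str = "config.yaml") -> dict:
    with open(path) as f:
        config = yaml.safe_load(f)
    return config.get("CONNECTOR_RUN_CONFIG", {}).get(connector, {})

# get_run_config("SHOPIFY") -> {"chunk_size": 300}
```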
4 changes: 2 additions & 2 deletions packages/valmi-connector-lib/.vscode/settings.json
@@ -12,8 +12,8 @@
"[python]": {
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": true,
"source.fixAll": true
"source.organizeImports": "explicit",
"source.fixAll": "explicit"
}
}
}
6 changes: 3 additions & 3 deletions packages/valmi-connector-lib/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "valmi-connector-lib"
version = "0.1.113"
version = "0.1.159"
description = ""
authors = ["Rajashekar Varkala <[email protected]>"]
readme = "README.md"
@@ -9,7 +9,7 @@ packages = [{include = "valmi_connector_lib"}]
[tool.poetry.dependencies]
python = "^3.9"
requests = "^2.30.0"
pydantic = "1.9.2"
pydantic = "^1.9.2"
valmi-airbyte-cdk = "^0.30.3"

[tool.poetry.group.dev.dependencies]
@@ -49,7 +49,7 @@ max-complexity = 10
max-line-length = 120

[build-system]
requires = ["poetry-core"]
requires = ["poetry-core", "cython<3.0"]
build-backend = "poetry.core.masonry.api"

# Example configuration for Black.
@@ -31,6 +31,7 @@
from typing import Any, Dict

from valmi_connector_lib.common.logs import SingletonLogWriter, TimeAndChunkEndFlushPolicy
from valmi_connector_lib.common.samples import SampleWriter
from valmi_connector_lib.destination_wrapper.engine import CONNECTOR_STRING

from .proc_stdout_handler import ProcStdoutHandlerThread
@@ -53,7 +53,7 @@
def get_airbyte_command():
entrypoint_str = os.environ["VALMI_ENTRYPOINT"]
entrypoint = entrypoint_str.split(" ")

airbyte_command = sys.argv[3]
for i, arg in enumerate(sys.argv[1:]):
if i >= len(entrypoint):
@@ -165,30 +166,32 @@ def main():
engine.connector_state.run_time_args["sync_id"],
engine.connector_state.run_time_args["run_id"],
CONNECTOR_STRING)

#initialize SampleWriter
SampleWriter.get_writer_by_metric_type(store_config_str=os.environ["VALMI_INTERMEDIATE_STORE"],
sync_id=engine.connector_state.run_time_args["sync_id"],
run_id=engine.connector_state.run_time_args["run_id"],
connector=CONNECTOR_STRING)

# initialize handler
for key in handlers.keys():
handlers[key] = handlers[key](engine=engine, store_writer=None, stdout_writer=None)
# initialize SampleWriter
SampleWriter.get_writer_by_metric_type(store_config_str=os.environ["VALMI_INTERMEDIATE_STORE"],
sync_id=engine.connector_state.run_time_args["sync_id"],
run_id=engine.connector_state.run_time_args["run_id"],
connector=CONNECTOR_STRING)

global loaded_state
store_reader = StoreReader(engine=engine, state=loaded_state)

# initialize handler
for key in handlers.keys():
handlers[key] = handlers[key](engine=engine, store_writer=None,
stdout_writer=None, store_reader=store_reader)

# create the subprocess
subprocess_args = sys.argv[1:]

# HACK: Remove destination_catalog command line argument when working with etl destination
# For ETL, there is no concept of destination catalog
if os.environ.get('MODE', 'any') == 'etl' and "--destination_catalog" in subprocess_args:
arg_idx = subprocess_args.index("--destination_catalog")
subprocess_args.remove("--destination_catalog")
subprocess_args.pop(arg_idx)

if is_state_available():
# For ETL, internal connectors do not need any state information
if os.environ.get('MODE', 'any') != 'etl' and is_state_available():
subprocess_args.append("--state")
subprocess_args.append(state_file_path)
proc = subprocess.Popen(subprocess_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
@@ -202,8 +205,6 @@ def main():
record_types = handlers.keys()

for line in store_reader.read():
print("Reading")
print(line)
if line.strip() == "":
continue
json_record = json.loads(line)
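
Taken together, the ETL-mode edits to main() change the subprocess invocation in two ways: the destination catalog flag is dropped (for ETL there is no concept of a destination catalog) and saved state is no longer forwarded (state is dictated by the source). A condensed sketch of that gating, with an invented helper name; only the flag names and the MODE check come from the diff:

```python
import os

def build_subprocess_args(argv, state_available, state_file_path):
    # Hypothetical condensation of the argv handling in main() above.
    args = list(argv)
    etl = os.environ.get("MODE", "any") == "etl"
    if etl and "--destination_catalog" in args:
        # ETL destinations have no catalog concept; strip the flag.
        args.pop(args.index("--destination_catalog"))
    if not etl and state_available:
        # For ETL, internal connectors do not need any state information.
        args += ["--state", state_file_path]
    return args
```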
@@ -53,10 +53,9 @@ def finalise_message_handling(self) -> HandlerResponseData:

def read_chunk_id_checkpoint(self):
if self.previous_state is not None \
and 'state' in self.previous_state \
and 'data' in self.previous_state['state'] \
and 'chunk_id' in self.previous_state['state']['data']:
return self.previous_state['state']['data']['chunk_id'] + 1
and '_valmi_meta' in self.previous_state \
and 'chunk_id' in self.previous_state['_valmi_meta']:
return self.previous_state['_valmi_meta']['chunk_id'] + 1
return 1

def start_message_handling(self, input_messages: Iterable[AirbyteMessage]) -> AirbyteMessage:
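
The resume marker moves from state.data.chunk_id to a top-level _valmi_meta block. An illustrative comparison of the two payload shapes and the updated resume logic; the chunk_id value is invented, only the key layout comes from the diff:

```python
old_state = {"state": {"data": {"chunk_id": 7}}}  # pre-PR layout
new_state = {"_valmi_meta": {"chunk_id": 7}}      # layout after this PR

def read_chunk_id_checkpoint(previous_state):
    # Resume from the chunk after the last checkpointed one,
    # or start at chunk 1 when no usable checkpoint exists.
    if previous_state is not None \
            and "_valmi_meta" in previous_state \
            and "chunk_id" in previous_state["_valmi_meta"]:
        return previous_state["_valmi_meta"]["chunk_id"] + 1
    return 1

assert read_chunk_id_checkpoint(new_state) == 8
assert read_chunk_id_checkpoint(old_state) == 1  # old shape no longer matches
```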
@@ -110,7 +110,8 @@ def __init__(self, *args, **kwargs):
self.connector_state = ConnectorState(run_time_args=run_time_args)

def current_run_details(self):
sync_id = du(os.environ.get("DAGSTER_RUN_JOB_NAME", "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"))
sync_id = du(os.environ.get("DAGSTER_RUN_JOB_NAME", "cf280e5c-1184-4052-b089-f9f41b25138e"))
# sync_id = du(os.environ.get("DAGSTER_RUN_JOB_NAME", "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"))
r = self.session_with_retries.get(
f"{self.engine_url}/syncs/{sync_id}/runs/current_run_details/{CONNECTOR_STRING}",
timeout=HTTP_TIMEOUT,
@@ -198,7 +199,7 @@ def checkpoint(self, state):
sync_id = self.connector_state.run_time_args["sync_id"]
run_id = self.connector_state.run_time_args["run_id"]
r = self.session_with_retries.post(
f"{self.engine_url}/syncs/{sync_id}/runs/{run_id}/state/{CONNECTOR_STRING}/",
f"{self.engine_url}/syncs/{sync_id}/runs/{run_id}/state/{CONNECTOR_STRING}/{os.environ.get('MODE', 'any')}",
timeout=HTTP_TIMEOUT,
json=state,
)
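
The checkpoint endpoint now carries the run mode as a trailing path segment. For illustration only, with invented ids:

```python
import os

# Invented values; only the path template comes from the diff above.
engine_url, sync_id, run_id, connector = "http://localhost:8000", "cf280e5c", "42", "dest"
mode = os.environ.get("MODE", "any")
url = f"{engine_url}/syncs/{sync_id}/runs/{run_id}/state/{connector}/{mode}"
# -> http://localhost:8000/syncs/cf280e5c/runs/42/state/dest/etl  (when MODE=etl)
```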
@@ -58,13 +58,17 @@ def __init__(self, engine: NullEngine, state: str) -> None:
self.engine = engine
self.connector_state: ConnectorState = self.engine.connector_state
self.loaded_state = state
self.current_file_name = None

store_config = json.loads(os.environ["VALMI_INTERMEDIATE_STORE"])
if store_config["provider"] == "local":
path_name = join(store_config["local"]["directory"], self.connector_state.run_time_args["run_id"], "data")
os.makedirs(path_name, exist_ok=True)
self.path_name = path_name
self.last_handled_fn = self.get_file_name_from_chunk_id(self.read_chunk_id_checkpoint())
if os.environ.get('MODE', 'any') == 'etl':
self.last_handled_fn = self.read_file_marker_from_checkpoint()
else:
self.last_handled_fn = self.get_file_name_from_chunk_id(self.read_chunk_id_checkpoint())

def read(self):
while True:
@@ -76,6 +80,7 @@ def read(self):
if self.last_handled_fn is not None and int(fn[:-5]) <= int(self.last_handled_fn[:-5]):
continue
if fn.endswith(".vald"):
self.current_file_name = fn
with open(join(self.path_name, fn), "r") as f:
for line in f.readlines():
# print("yiedling", line)
@@ -101,10 +106,16 @@ def read(self):
def read_chunk_id_checkpoint(self):
# TODO: connector_state is not being used for destination, clean it up.
if self.loaded_state is not None \
and 'state' in self.loaded_state \
and 'data' in self.loaded_state['state'] \
and 'chunk_id' in self.loaded_state['state']['data']:
return self.loaded_state['state']['data']['chunk_id']
and '_valmi_meta' in self.loaded_state \
and 'chunk_id' in self.loaded_state['_valmi_meta']:
return self.loaded_state['_valmi_meta']['chunk_id']
return None

def read_file_marker_from_checkpoint(self):
if self.loaded_state is not None \
and '_valmi_meta' in self.loaded_state \
and 'file_marker' in self.loaded_state['_valmi_meta']:
return self.loaded_state["_valmi_meta"]["file_marker"]
return None

def get_file_name_from_chunk_id(self, chunk_id):
@@ -154,30 +165,33 @@ def handle(self, record) -> bool:
print("Checkpoint seen")
print(record)

records_delivered = record["state"]["data"]["records_delivered"]
finished = record["state"]["data"]["finished"]
commit_state = record["state"]["data"]["commit_state"]
commit_metric = record["state"]["data"]["commit_metric"]

total_records = 0
for k, v in records_delivered.items():
total_records += v

self.engine.connector_state.register_records(total_records)

if commit_metric:
self.engine.metric_ext(records_delivered, record["state"]["data"]["chunk_id"], commit=True)
# self.engine.connector_state.register_chunk()
if commit_state:
self.engine.checkpoint(record)
if SingletonLogWriter.instance() is not None:
SingletonLogWriter.instance().data_chunk_flush_callback()
SampleWriter.data_chunk_flush_callback()
else:
if SingletonLogWriter.instance() is not None:
SingletonLogWriter.instance().check_for_flush()

return True
if os.environ.get('MODE', 'any') == 'etl':
return True
else :
records_delivered = record["state"]["data"]["records_delivered"]
finished = record["state"]["data"]["finished"]
commit_state = record["state"]["data"]["commit_state"]
commit_metric = record["state"]["data"]["commit_metric"]

total_records = 0
for k, v in records_delivered.items():
total_records += v

self.engine.connector_state.register_records(total_records)

if commit_metric:
self.engine.metric_ext(records_delivered, record["state"]["data"]["chunk_id"], commit=True)
# self.engine.connector_state.register_chunk()
if commit_state:
self.engine.checkpoint(record["state"])
if SingletonLogWriter.instance() is not None:
SingletonLogWriter.instance().data_chunk_flush_callback()
SampleWriter.data_chunk_flush_callback()
else:
if SingletonLogWriter.instance() is not None:
SingletonLogWriter.instance().check_for_flush()

return True


class RecordHandler(DefaultHandler):
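
In ETL mode the reader now resumes from a file marker instead of a chunk id; both markers live under _valmi_meta. Data files end in ".vald", so fn[:-5] strips the five-character suffix before the numeric comparison in read(). A sketch with invented file names; the key names, the suffix, and the skip test come from the diff:

```python
etl_state = {"_valmi_meta": {"file_marker": "12.vald"}}  # ETL: resume by file
batch_state = {"_valmi_meta": {"chunk_id": 12}}          # non-ETL: resume by chunk

def should_skip(fn, last_handled_fn):
    # Mirror of the skip test in read(): drop every file at or before
    # the marker; fn[:-5] removes the ".vald" suffix (5 characters).
    return last_handled_fn is not None and int(fn[:-5]) <= int(last_handled_fn[:-5])

assert should_skip("12.vald", "12.vald") is True   # already handled
assert should_skip("13.vald", "12.vald") is False  # next file to read
```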
@@ -78,7 +78,7 @@ def run(self) -> None:
ret_val = handlers[json_record["type"]].handle(json_record)
if ret_val is False: # TODO: comes from ERROR Trace, should be handled cleanly
self.proc.kill()
os._exit(0) # error is already logged with engine in the handler
os._exit(1) # error is already logged with engine in the handler

# stdout finished. clean close
self.exit_flag = True
@@ -29,6 +29,7 @@
StoreReader,
StdoutWriter,
)
import os


class ReadDefaultHandler:
@@ -62,7 +63,12 @@ def __init__(self, *args, **kwargs):
super(ReadCheckpointHandler, self).__init__(*args, **kwargs)

def handle(self, record) -> bool:
# do an engine call to proceed.
# For ETL, we store the checkpoint for the reader instead of the destination stdout state,
# because state is dictated by the source.
if os.environ.get('MODE', 'any') == 'etl':
_valmi_meta = {"file_marker": self.store_reader.current_file_name}
record["state"]["_valmi_meta"] = _valmi_meta
self.engine.checkpoint(record["state"])
return True
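
For ETL, the state that gets checkpointed is the source's own, enriched with the reader's current file so a restarted run can resume where it left off. An illustrative payload with invented values and an assumed Airbyte-style state shape; only the _valmi_meta and file_marker keys come from the diff:

```python
# Assumed shape of a parsed STATE record when handle() runs;
# the "data" contents are invented source state.
record = {
    "type": "STATE",
    "state": {"data": {"updated_at": "2024-06-17T00:00:00Z"}},
}

# In ETL mode the handler tags the state with the reader's position...
record["state"]["_valmi_meta"] = {"file_marker": "13.vald"}
# ...and engine.checkpoint(record["state"]) persists it, so that
# StoreReader.read_file_marker_from_checkpoint() later returns "13.vald".
```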

