feat(integrations): Add llama_parse integration for document parsing. (#823)

> [!IMPORTANT]
> Add Llama document parsing integration with new classes, configuration updates, and tests.
>
> - **Integration**:
>   - Add `LlamaParseIntegrationDef` and `LlamaParseIntegrationDefUpdate` classes in `Tools.py` for Llama document parsing.
>   - Add `LlamaParseFetchArguments`, `LlamaParseFetchArgumentsUpdate`, `LlamaParseSetup`, and `LlamaParseSetupUpdate` classes for handling setup and arguments.
>   - Implement `parse()` function in `llama_parse.py` for parsing documents using LlamaParse.
> - **Configuration**:
>   - Add `llama-index` and `llama-parse` dependencies to `pyproject.toml`.
> - **Testing**:
>   - Add `MockLlamaParseClient` in `mocks/llama_parse.py` for testing.
>   - Add `test_llama_parse_provider()` in `test_providers.py` to test LlamaParse provider configuration.
>   - Update `conftest.py` to include LlamaParse in mocked services.
>
> <sup>This description was created by </sup>[<img alt="Ellipsis" src="https://img.shields.io/badge/Ellipsis-blue?color=175173">](https://www.ellipsis.dev?ref=julep-ai%2Fjulep&utm_source=github&utm_medium=referral)<sup> for 239e6c7. It will automatically update as commits are pushed.</sup>

---------

Co-authored-by: Vedantsahai18 <[email protected]>
julep-ai · Nov 10, 2024 · b6308f1 · b6308f1
1 parent 1ded44f
commit b6308f1
Show file tree

Hide file tree

Showing 21 changed files with 1,544 additions and 14 deletions.
diff --git a/agents-api/agents_api/activities/execute_integration.py b/agents-api/agents_api/activities/execute_integration.py
@@ -11,7 +11,7 @@
 from ..models.tools import get_tool_args_from_metadata
 
 
-@auto_blob_store
+@auto_blob_store(deep=True)
 @beartype
 async def execute_integration(
     context: StepContext,

diff --git a/agents-api/agents_api/activities/task_steps/base_evaluate.py b/agents-api/agents_api/activities/task_steps/base_evaluate.py
@@ -1,9 +1,14 @@
 import ast
 from typing import Any
 
+import simpleeval
 from beartype import beartype
 from box import Box
 from openai import BaseModel
+
+# Increase the max string length to 300000
+simpleeval.MAX_STRING_LENGTH = 300000
+
 from simpleeval import NameNotDefined, SimpleEval
 from temporalio import activity
 from thefuzz import fuzz

diff --git a/agents-api/agents_api/autogen/Tools.py b/agents-api/agents_api/autogen/Tools.py
@@ -12,6 +12,7 @@
     BaseModel,
     ConfigDict,
     Field,
+    RootModel,
     StrictBool,
 )
 
@@ -50,6 +51,10 @@ class ApiCallDef(BaseModel):
     """
     The data to send as form data
     """
+    files: dict[str, Any] | None = None
+    """
+    The data to send as files data
+    """
     json_: Annotated[dict[str, Any] | None, Field(alias="json")] = None
     """
     JSON body to send with the request
@@ -117,6 +122,10 @@ class ApiCallDefUpdate(BaseModel):
     """
     The data to send as form data
     """
+    files: dict[str, Any] | None = None
+    """
+    The data to send as files data
+    """
     json_: Annotated[dict[str, Any] | None, Field(alias="json")] = None
     """
     JSON body to send with the request
@@ -189,6 +198,7 @@ class BaseIntegrationDef(BaseModel):
         "browserbase",
         "email",
         "remote_browser",
+        "llama_parse",
     ]
     """
     The provider of the integration
@@ -225,6 +235,7 @@ class BaseIntegrationDefUpdate(BaseModel):
             "browserbase",
             "email",
             "remote_browser",
+            "llama_parse",
         ]
         | None
     ) = None
@@ -720,6 +731,7 @@ class CreateToolRequest(BaseModel):
         | BrowserbaseGetSessionLiveUrlsIntegrationDef
         | BrowserbaseGetSessionConnectUrlIntegrationDef
         | RemoteBrowserIntegrationDef
+        | LlamaParseIntegrationDef
         | None
     ) = None
     """
@@ -947,6 +959,154 @@ class FunctionDef(BaseModel):
     """
 
 
+class LlamaParseFetchArguments(BaseModel):
+    """
+    Arguments for LlamaParse integration
+    """
+
+    model_config = ConfigDict(
+        populate_by_name=True,
+    )
+    result_format: Literal["text", "markdown"] = "text"
+    """
+    The format of the result. Can be "text" or "markdown". Default is "text".
+    """
+    num_workers: Annotated[int, Field(ge=1, le=10)] = 2
+    """
+    Number of workers for parallel processing. Default is 2, but can be set between 1 and 10.
+    """
+    verbose: StrictBool = True
+    """
+    Verbose mode for detailed logging. Default is true.
+    """
+    language: str = "en"
+    """
+    Language of the text. Default is English.
+    """
+    filename: str | None = None
+    """
+    File Name. If not provided, a random name will be generated.
+    """
+    file: str
+    """
+    The base64 string of the file
+    """
+
+
+class LlamaParseFetchArgumentsUpdate(BaseModel):
+    """
+    Arguments for LlamaParse integration
+    """
+
+    model_config = ConfigDict(
+        populate_by_name=True,
+    )
+    result_format: Literal["text", "markdown"] = "text"
+    """
+    The format of the result. Can be "text" or "markdown". Default is "text".
+    """
+    num_workers: Annotated[int, Field(ge=1, le=10)] = 2
+    """
+    Number of workers for parallel processing. Default is 2, but can be set between 1 and 10.
+    """
+    verbose: StrictBool = True
+    """
+    Verbose mode for detailed logging. Default is true.
+    """
+    language: str = "en"
+    """
+    Language of the text. Default is English.
+    """
+    filename: str | None = None
+    """
+    File Name. If not provided, a random name will be generated.
+    """
+    file: str | None = None
+    """
+    The base64 string of the file
+    """
+
+
+class LlamaParseIntegrationDef(BaseIntegrationDef):
+    """
+    LlamaParse integration definition
+    """
+
+    model_config = ConfigDict(
+        populate_by_name=True,
+    )
+    provider: Literal["llama_parse"] = "llama_parse"
+    """
+    The provider must be "llama_parse"
+    """
+    method: str | None = None
+    """
+    The specific method of the integration to call
+    """
+    setup: LlamaParseSetup | None = None
+    """
+    The setup parameters for LlamaParse
+    """
+    arguments: LlamaParseFetchArguments | None = None
+    """
+    The arguments for LlamaParse
+    """
+
+
+class LlamaParseIntegrationDefUpdate(BaseIntegrationDefUpdate):
+    """
+    LlamaParse integration definition
+    """
+
+    model_config = ConfigDict(
+        populate_by_name=True,
+    )
+    provider: Literal["llama_parse"] = "llama_parse"
+    """
+    The provider must be "llama_parse"
+    """
+    method: str | None = None
+    """
+    The specific method of the integration to call
+    """
+    setup: LlamaParseSetupUpdate | None = None
+    """
+    The setup parameters for LlamaParse
+    """
+    arguments: LlamaParseFetchArgumentsUpdate | None = None
+    """
+    The arguments for LlamaParse
+    """
+
+
+class LlamaParseSetup(BaseModel):
+    """
+    Setup parameters for LlamaParse integration
+    """
+
+    model_config = ConfigDict(
+        populate_by_name=True,
+    )
+    llamaparse_api_key: str
+    """
+    The API key for LlamaParse
+    """
+
+
+class LlamaParseSetupUpdate(BaseModel):
+    """
+    Setup parameters for LlamaParse integration
+    """
+
+    model_config = ConfigDict(
+        populate_by_name=True,
+    )
+    llamaparse_api_key: str | None = None
+    """
+    The API key for LlamaParse
+    """
+
+
 class NamedToolChoice(BaseModel):
     model_config = ConfigDict(
         populate_by_name=True,
@@ -1005,6 +1165,7 @@ class PatchToolRequest(BaseModel):
         | BrowserbaseGetSessionLiveUrlsIntegrationDefUpdate
         | BrowserbaseGetSessionConnectUrlIntegrationDefUpdate
         | RemoteBrowserIntegrationDefUpdate
+        | LlamaParseIntegrationDefUpdate
         | None
     ) = None
     """
@@ -1428,6 +1589,7 @@ class Tool(BaseModel):
         | BrowserbaseGetSessionLiveUrlsIntegrationDef
         | BrowserbaseGetSessionConnectUrlIntegrationDef
         | RemoteBrowserIntegrationDef
+        | LlamaParseIntegrationDef
         | None
     ) = None
     """
@@ -1517,6 +1679,7 @@ class UpdateToolRequest(BaseModel):
         | BrowserbaseGetSessionLiveUrlsIntegrationDef
         | BrowserbaseGetSessionConnectUrlIntegrationDef
         | RemoteBrowserIntegrationDef
+        | LlamaParseIntegrationDef
         | None
     ) = None
     """

diff --git a/agents-api/agents_api/common/protocol/remote.py b/agents-api/agents_api/common/protocol/remote.py
@@ -93,9 +93,10 @@ def unload_attribute(self, name: str) -> None:
             remote_obj = self.__save_item(data)
             super().__setattr__(name, remote_obj)
 
-    def unload_all(self) -> None:
+    def unload_all(self) -> "BaseRemoteModel":
         for name in list(self._remote_cache.keys()):
             self.unload_attribute(name)
+        return self
 
 
 class RemoteList(list):

diff --git a/agents-api/agents_api/common/storage_handler.py b/agents-api/agents_api/common/storage_handler.py
@@ -4,6 +4,7 @@
 from functools import wraps
 from typing import Any, Callable
 
+from pydantic import BaseModel
 from temporalio import workflow
 
 from ..activities.sync_items_remote import load_inputs_remote
@@ -76,6 +77,31 @@ def load_args(
                                 for key, value in arg.items()
                             }
                         )
+                    elif isinstance(arg, BaseRemoteModel):
+                        new_args.append(arg.unload_all())
+
+                    elif isinstance(arg, BaseModel):
+                        for field in arg.model_fields.keys():
+                            if isinstance(getattr(arg, field), RemoteObject):
+                                setattr(
+                                    arg,
+                                    field,
+                                    load_from_blob_store_if_remote(getattr(arg, field)),
+                                )
+                            elif isinstance(getattr(arg, field), RemoteList):
+                                setattr(
+                                    arg,
+                                    field,
+                                    [
+                                        load_from_blob_store_if_remote(item)
+                                        for item in getattr(arg, field)
+                                    ],
+                                )
+                            elif isinstance(getattr(arg, field), BaseRemoteModel):
+                                setattr(arg, field, getattr(arg, field).unload_all())
+
+                        new_args.append(arg)
+
                     else:
                         new_args.append(arg)
 
@@ -93,6 +119,30 @@ def load_args(
                             for key, value in v.items()
                         }
 
+                    elif isinstance(v, BaseRemoteModel):
+                        new_kwargs[k] = v.unload_all()
+
+                    elif isinstance(v, BaseModel):
+                        for field in v.model_fields.keys():
+                            if isinstance(getattr(v, field), RemoteObject):
+                                setattr(
+                                    v,
+                                    field,
+                                    load_from_blob_store_if_remote(getattr(v, field)),
+                                )
+                            elif isinstance(getattr(v, field), RemoteList):
+                                setattr(
+                                    v,
+                                    field,
+                                    [
+                                        load_from_blob_store_if_remote(item)
+                                        for item in getattr(v, field)
+                                    ],
+                                )
+                            elif isinstance(getattr(v, field), BaseRemoteModel):
+                                setattr(v, field, getattr(v, field).unload_all())
+                        new_kwargs[k] = v
+
                     else:
                         new_kwargs[k] = v
 

diff --git a/cookbooks/01-Website_Crawler_using_Spider.py b/cookbooks/01-Website_Crawler_using_Spider.py
@@ -27,21 +27,34 @@
 )
 
 # Defining a Task
-task_def = yaml.safe_load("""
-name: Agent Crawler
+task_def = yaml.safe_load(f"""
+name: Crawling Task
 
+# Define the tools that the agent will use in this workflow
 tools:
 - name: spider_crawler
   type: integration
   integration:
     provider: spider
     setup:
-      spider_api_key: "{{SPIDER_API_KEY}}"
+      spider_api_key: "{spider_api_key}"
 
+# Define the steps of the workflow
 main:
+# Define a tool call step that calls the spider_crawler tool with the url input
 - tool: spider_crawler
   arguments:
-    url: '"https://spider.cloud"'
+    url: "_['url']" # You can also use 'inputs[0]['url']'
+  
+    
+- prompt: |
+    You are {{{{agent.about}}}}
+    I have given you this url: {{{{inputs[0]['url']}}}}
+    And you have crawled that website. Here are the results you found:
+    {{{{_['documents']}}}}
+    I want you to create a short summary (no longer than 100 words) of the results you found while crawling that website.
+
+  unwrap: True
 """)
 
 # Creating/Updating a task