diff --git a/src/sherpa_ai/actions/link_scraper.py b/src/sherpa_ai/actions/link_scraper.py
new file mode 100644
index 00000000..3c04401a
--- /dev/null
+++ b/src/sherpa_ai/actions/link_scraper.py
@@ -0,0 +1,33 @@
+from typing import Any
+from sherpa_ai.actions.base import BaseAction
+from sherpa_ai.tools import LinkScraperTool
+from pydantic import BaseModel, Field
+
+
+class LinkScraperAction(BaseAction, BaseModel):
+    llm: Any
+    name: str = "Link Scraper"
+    args: dict = {
+        "url": "the url to be scrapped",
+    }
+    usage: str = "Simple link scraper that scrapes the data from url and returns string"
+    scraper_tool: LinkScraperTool = Field(default_factory=LinkScraperTool)
+
+    def execute(self, url: str, **kwargs) -> str:
+        """
+        Executes the scraper tool and returns the scraped data.
+        Args:
+            url: The url to be scraped
+        Returns:
+            str: The data scraped from the url, or "" if scraping failed
+        """
+        try:
+            # LinkScraperTool._run takes only the query; the llm is a field on
+            # the tool, so inject it rather than passing it positionally
+            # (calling _run(url, self.llm) raised TypeError on every call).
+            self.scraper_tool.llm = self.llm
+            result = self.scraper_tool._run(url)
+            if not result or len(result) == 0:
+                raise ValueError("Scraper returned no content.")
+            return str(result)
+        except Exception:
+            # Best-effort action: any scraping failure degrades to "".
+            return ""
\ No newline at end of file
diff --git a/src/sherpa_ai/tools.py b/src/sherpa_ai/tools.py
index 2bc9a90b..abbf6e33 100644
--- a/src/sherpa_ai/tools.py
+++ b/src/sherpa_ai/tools.py
@@ -3,14 +3,22 @@
 from typing import Any, List, Tuple, Union
 
 import requests
-from langchain_community.utilities import GoogleSerperAPIWrapper
-from langchain_core.tools import BaseTool
-from langchain_core.vectorstores import VectorStoreRetriever
-from loguru import logger
-from typing_extensions import Literal
+from langchain_community.utilities import GoogleSerperAPIWrapper
+from langchain_core.tools import BaseTool
+from langchain_core.vectorstores import VectorStoreRetriever
+from loguru import logger
+from typing_extensions import Literal
 
 import sherpa_ai.config as cfg
 from sherpa_ai.config.task_config import AgentConfig
+from sherpa_ai.scrape.extract_github_readme import extract_github_readme
+from sherpa_ai.utils
import (
+    chunk_and_summarize,
+    count_string_tokens,
+    get_links_from_text,
+    rewrite_link_references,
+    scrape_with_url,
+)
 
 HTTP_GET_TIMEOUT = 2.5
@@ -297,3 +305,76 @@ def _run(self, query: str) -> str:
 
     def _arun(self, query: str) -> str:
         raise NotImplementedError("UserInputTool does not support async run")
+
+
+class LinkScraperTool(BaseTool):
+    name = "Link Scraper"
+    description = "Access the content of a link. Only use this tool when you need to extract information from a link."
+    llm: Any
+
+    def _run(
+        self,
+        query: str,
+    ) -> str:
+
+        query_links = get_links_from_text(query)
+        # if there is a link inside the question scrape then summarize based
+        # on question and then aggregate to the question
+
+        if len(query_links) > 0:
+            # TODO I should get gpt-3.5-turbo from an environment variable or a config file
+            available_token = 3000 - count_string_tokens(query, "gpt-3.5-turbo")
+            per_scrape_token_size = available_token / len(query_links)
+            final_summary = []
+            for last_message_link in query_links:
+                link = last_message_link["url"]
+                scraped_data = ""
+                if "github" in last_message_link["base_url"]:  # route THIS link, not the last one
+                    git_scraper = extract_github_readme(link)
+                    if git_scraper:
+                        scraped_data = {
+                            "data": git_scraper,
+                            "status": 200,
+                        }
+                    else:
+                        scraped_data = {"data": "", "status": 404}
+                else:
+                    scraped_data = scrape_with_url(link)
+                if scraped_data["status"] == 200:
+                    chunk_summary = chunk_and_summarize(
+                        link=link,
+                        question=query,
+                        text_data=scraped_data["data"],
+                        # TODO_ user id is not going to be needed here in the future
+                        # user_id="",
+                        llm=self.llm,
+                    )
+
+                    while (
+                        count_string_tokens(chunk_summary, "gpt-3.5-turbo")
+                        > per_scrape_token_size
+                    ):
+                        chunk_summary = chunk_and_summarize(
+                            link=link,
+                            question=query,
+                            text_data=chunk_summary,
+                            # user_id="",
+                            llm=self.llm,
+                        )
+
+                    final_summary.append({"data": chunk_summary, "link": link})
+                else:
+                    final_summary.append({"data": "Scraping failed", "link": link})
+
+            scraped_data = 
rewrite_link_references(question=query, data=final_summary) + resources = [] + resources.append( + { + "Document": scraped_data, + "Source": ", ".join([link["url"] for link in query_links]), + } + ) + return resources + + def _arun(self, query: str) -> str: + raise NotImplementedError("LinkScraperTool does not support async run") diff --git a/src/sherpa_ai/utils.py b/src/sherpa_ai/utils.py index a1d4c96e..497dfc02 100644 --- a/src/sherpa_ai/utils.py +++ b/src/sherpa_ai/utils.py @@ -641,3 +641,10 @@ def file_text_splitter(data, meta_data): texts = temp_texts return {"texts": texts, "meta_datas": metadatas} + +def get_links_from_text(text: str) -> List[str]: + url_regex = r"(https?://\S+|www\.\S+)" + urls = re.findall(url_regex, text) + + result = [{"url": url, "base_url": get_base_url(url)} for url in urls] + return result diff --git a/src/tests/data/test_link_scraper_action_t_0.jsonl b/src/tests/data/test_link_scraper_action_t_0.jsonl new file mode 100644 index 00000000..1018f3b3 --- /dev/null +++ b/src/tests/data/test_link_scraper_action_t_0.jsonl @@ -0,0 +1,4 @@ +{"input": [{"text": "You are a **question answering assistant** who solves user questions and offers a detailed solution.\\n\\nYour name is QA Agent.\\n\\nGiven your specialized expertise, historical context, and your mission to facilitate Machine-Learning-based solutions, determine which action and its corresponding arguments would be the most scientifically sound and efficient approach to achieve the described task.\\n\\n**Task Description**: what is unique about Ethiopia calendar? based on these links https://www.ethiopiancalendar.net/\\n\\n**Possible Actions**:\\n{\\n \"name\": \"Link Scraper\",\\n \"args\": {\\n \"url\": \"the url to be scrapped\"\\n },\\n \"usage\": \"Simple link scraper that scrapes the data from url and returns string\"\\n}\\n\\n**Task Context**:\\nwhat is unique about Ethiopia calendar? 
based on these links https://www.ethiopiancalendar.net/\\n\\n\\n**History of Previous Actions**:\\n\\n\\nYou should only select the actions specified in **Possible Actions**\\nYou should only respond in JSON format as described below without any extra text.\\nResponse Format:\\n{\\n \"command\": {\\n \"name\": \"tool/command name you choose\",\\n \"args\": {\\n \"arg name\": \"value\"\\n }\\n }\\n}\\nEnsure the response can be parsed by Python json.loads\\n\\nFollow the described format strictly.\\n\\n", "agent": "human"}], "output": "{\\n \"command\": {\\n \"name\": \"Link Scraper\",\\n \"args\": {\\n \"url\": \"https://www.ethiopiancalendar.net/\"\\n }\\n }\\n}", "llm_name": "gpt-3.5-turbo"} +{"input": [{"text": "You are a **question answering assistant** who solves user questions and offers a detailed solution.\\n\\nYour name is QA Agent.\\n\\nGiven your specialized expertise, historical context, and your mission to facilitate Machine-Learning-based solutions, determine which action and its corresponding arguments would be the most scientifically sound and efficient approach to achieve the described task.\\n\\n**Task Description**: what is unique about Ethiopia calendar? based on these links https://www.ethiopiancalendar.net/\\n\\n**Possible Actions**:\\n{\\n \"name\": \"Link Scraper\",\\n \"args\": {\\n \"url\": \"the url to be scrapped\"\\n },\\n \"usage\": \"Simple link scraper that scrapes the data from url and returns string\"\\n}\\n\\n**Task Context**:\\nwhat is unique about Ethiopia calendar? 
based on these links https://www.ethiopiancalendar.net/\\n\\n\\n**History of Previous Actions**:\\nAction: Link Scraper{'url': 'https://www.ethiopiancalendar.net/'}\\nOutput: \\n\\nYou should only select the actions specified in **Possible Actions**\\nYou should only respond in JSON format as described below without any extra text.\\nResponse Format:\\n{\\n \"command\": {\\n \"name\": \"tool/command name you choose\",\\n \"args\": {\\n \"arg name\": \"value\"\\n }\\n }\\n}\\nEnsure the response can be parsed by Python json.loads\\n\\nFollow the described format strictly.\\n\\n", "agent": "human"}], "output": "{\\n \"command\": {\\n \"name\": \"Link Scraper\",\\n \"args\": {\\n \"url\": \"https://www.ethiopiancalendar.net/\"\\n }\\n }\\n}", "llm_name": "gpt-3.5-turbo"} +{"input": [{"text": "You are a **question answering assistant** who solves user questions and offers a detailed solution.\\n\\nYour name is QA Agent.\\n\\nGiven your specialized expertise, historical context, and your mission to facilitate Machine-Learning-based solutions, determine which action and its corresponding arguments would be the most scientifically sound and efficient approach to achieve the described task.\\n\\n**Task Description**: what is unique about Ethiopia calendar? based on these links https://www.ethiopiancalendar.net/\\n\\n**Possible Actions**:\\n{\\n \"name\": \"Link Scraper\",\\n \"args\": {\\n \"url\": \"the url to be scrapped\"\\n },\\n \"usage\": \"Simple link scraper that scrapes the data from url and returns string\"\\n}\\n\\n**Task Context**:\\nwhat is unique about Ethiopia calendar? 
based on these links https://www.ethiopiancalendar.net/\\n\\n\\n**History of Previous Actions**:\\nAction: Link Scraper{'url': 'https://www.ethiopiancalendar.net/'}\\nOutput: \\nAction: Link Scraper{'url': 'https://www.ethiopiancalendar.net/'}\\nOutput: \\n\\nYou should only select the actions specified in **Possible Actions**\\nYou should only respond in JSON format as described below without any extra text.\\nResponse Format:\\n{\\n \"command\": {\\n \"name\": \"tool/command name you choose\",\\n \"args\": {\\n \"arg name\": \"value\"\\n }\\n }\\n}\\nEnsure the response can be parsed by Python json.loads\\n\\nFollow the described format strictly.\\n\\n", "agent": "human"}], "output": "{\\n \"command\": {\\n \"name\": \"Link Scraper\",\\n \"args\": {\\n \"url\": \"https://www.ethiopiancalendar.net/\"\\n }\\n }\\n}", "llm_name": "gpt-3.5-turbo"} +{"input": [{"text": "You are a **question answering assistant** who solves user questions and offers a detailed solution.\\n\\nYour name is QA Agent.\\n\\nContext: what is unique about Ethiopia calendar? based on these links https://www.ethiopiancalendar.net/\\n\\n\\nAction - Result History:\\nAction: Link Scraper{'url': 'https://www.ethiopiancalendar.net/'}\\nOutput: \\nAction: Link Scraper{'url': 'https://www.ethiopiancalendar.net/'}\\nOutput: \\nAction: Link Scraper{'url': 'https://www.ethiopiancalendar.net/'}\\nOutput: \\n\\nGiven the context and the action-result history, please complete the task mentioned. Include any links you used from the context and history in the result.\\nTask: what is unique about Ethiopia calendar? based on these links https://www.ethiopiancalendar.net/\\nResult:\\n", "agent": "human"}], "output": "The Ethiopian calendar is unique because it follows a different system than the Gregorian calendar commonly used in the Western world. 
The Ethiopian calendar has 13 months, with 12 months having 30 days each and a 13th month called Pagum\u0113 with 5 or 6 days, depending on whether it is a leap year. This makes the Ethiopian year shorter than the Gregorian year by about 7-8 years.\\n\\nAdditionally, the Ethiopian calendar is about 7-8 years behind the Gregorian calendar. For example, while the year 2021 is recognized in the Gregorian calendar, it is currently the year 2013 in the Ethiopian calendar.\\n\\nThe Ethiopian calendar also has its own unique New Year celebration called Enkutatash, which falls on September 11th (or September 12th in leap years) according to the Gregorian calendar.\\n\\nFor more information, you can visit the official website of the Ethiopian calendar at https://www.ethiopiancalendar.net/.", "llm_name": "gpt-3.5-turbo"} diff --git a/src/tests/data/test_link_scraper_action_t_1.jsonl b/src/tests/data/test_link_scraper_action_t_1.jsonl new file mode 100644 index 00000000..696294ec --- /dev/null +++ b/src/tests/data/test_link_scraper_action_t_1.jsonl @@ -0,0 +1,4 @@ +{"input": [{"text": "You are a **question answering assistant** who solves user questions and offers a detailed solution.\\n\\nYour name is QA Agent.\\n\\nGiven your specialized expertise, historical context, and your mission to facilitate Machine-Learning-based solutions, determine which action and its corresponding arguments would be the most scientifically sound and efficient approach to achieve the described task.\\n\\n**Task Description**: what is unique about Ethiopia calendar? based on this link https://en.wikipedia.org/wiki/Ethiopian_calendar\\n\\n**Possible Actions**:\\n{\\n \"name\": \"Link Scraper\",\\n \"args\": {\\n \"url\": \"the url to be scrapped\"\\n },\\n \"usage\": \"Simple link scraper that scrapes the data from url and returns string\"\\n}\\n\\n**Task Context**:\\nwhat is unique about Ethiopia calendar? 
based on this link https://en.wikipedia.org/wiki/Ethiopian_calendar\\n\\n\\n**History of Previous Actions**:\\n\\n\\nYou should only select the actions specified in **Possible Actions**\\nYou should only respond in JSON format as described below without any extra text.\\nResponse Format:\\n{\\n \"command\": {\\n \"name\": \"tool/command name you choose\",\\n \"args\": {\\n \"arg name\": \"value\"\\n }\\n }\\n}\\nEnsure the response can be parsed by Python json.loads\\n\\nFollow the described format strictly.\\n\\n", "agent": "human"}], "output": "{\\n \"command\": {\\n \"name\": \"Link Scraper\",\\n \"args\": {\\n \"url\": \"https://en.wikipedia.org/wiki/Ethiopian_calendar\"\\n }\\n }\\n}", "llm_name": "gpt-3.5-turbo"} +{"input": [{"text": "You are a **question answering assistant** who solves user questions and offers a detailed solution.\\n\\nYour name is QA Agent.\\n\\nGiven your specialized expertise, historical context, and your mission to facilitate Machine-Learning-based solutions, determine which action and its corresponding arguments would be the most scientifically sound and efficient approach to achieve the described task.\\n\\n**Task Description**: what is unique about Ethiopia calendar? based on this link https://en.wikipedia.org/wiki/Ethiopian_calendar\\n\\n**Possible Actions**:\\n{\\n \"name\": \"Link Scraper\",\\n \"args\": {\\n \"url\": \"the url to be scrapped\"\\n },\\n \"usage\": \"Simple link scraper that scrapes the data from url and returns string\"\\n}\\n\\n**Task Context**:\\nwhat is unique about Ethiopia calendar? 
based on this link https://en.wikipedia.org/wiki/Ethiopian_calendar\\n\\n\\n**History of Previous Actions**:\\nAction: Link Scraper{'url': 'https://en.wikipedia.org/wiki/Ethiopian_calendar'}\\nOutput: \\n\\nYou should only select the actions specified in **Possible Actions**\\nYou should only respond in JSON format as described below without any extra text.\\nResponse Format:\\n{\\n \"command\": {\\n \"name\": \"tool/command name you choose\",\\n \"args\": {\\n \"arg name\": \"value\"\\n }\\n }\\n}\\nEnsure the response can be parsed by Python json.loads\\n\\nFollow the described format strictly.\\n\\n", "agent": "human"}], "output": "{\\n \"command\": {\\n \"name\": \"Link Scraper\",\\n \"args\": {\\n \"url\": \"https://en.wikipedia.org/wiki/Ethiopian_calendar\"\\n }\\n }\\n}", "llm_name": "gpt-3.5-turbo"} +{"input": [{"text": "You are a **question answering assistant** who solves user questions and offers a detailed solution.\\n\\nYour name is QA Agent.\\n\\nGiven your specialized expertise, historical context, and your mission to facilitate Machine-Learning-based solutions, determine which action and its corresponding arguments would be the most scientifically sound and efficient approach to achieve the described task.\\n\\n**Task Description**: what is unique about Ethiopia calendar? based on this link https://en.wikipedia.org/wiki/Ethiopian_calendar\\n\\n**Possible Actions**:\\n{\\n \"name\": \"Link Scraper\",\\n \"args\": {\\n \"url\": \"the url to be scrapped\"\\n },\\n \"usage\": \"Simple link scraper that scrapes the data from url and returns string\"\\n}\\n\\n**Task Context**:\\nwhat is unique about Ethiopia calendar? 
based on this link https://en.wikipedia.org/wiki/Ethiopian_calendar\\n\\n\\n**History of Previous Actions**:\\nAction: Link Scraper{'url': 'https://en.wikipedia.org/wiki/Ethiopian_calendar'}\\nOutput: \\nAction: Link Scraper{'url': 'https://en.wikipedia.org/wiki/Ethiopian_calendar'}\\nOutput: \\n\\nYou should only select the actions specified in **Possible Actions**\\nYou should only respond in JSON format as described below without any extra text.\\nResponse Format:\\n{\\n \"command\": {\\n \"name\": \"tool/command name you choose\",\\n \"args\": {\\n \"arg name\": \"value\"\\n }\\n }\\n}\\nEnsure the response can be parsed by Python json.loads\\n\\nFollow the described format strictly.\\n\\n", "agent": "human"}], "output": "{\\n \"command\": {\\n \"name\": \"Link Scraper\",\\n \"args\": {\\n \"url\": \"https://en.wikipedia.org/wiki/Ethiopian_calendar\"\\n }\\n }\\n}", "llm_name": "gpt-3.5-turbo"} +{"input": [{"text": "You are a **question answering assistant** who solves user questions and offers a detailed solution.\\n\\nYour name is QA Agent.\\n\\nContext: what is unique about Ethiopia calendar? based on this link https://en.wikipedia.org/wiki/Ethiopian_calendar\\n\\n\\nAction - Result History:\\nAction: Link Scraper{'url': 'https://en.wikipedia.org/wiki/Ethiopian_calendar'}\\nOutput: \\nAction: Link Scraper{'url': 'https://en.wikipedia.org/wiki/Ethiopian_calendar'}\\nOutput: \\nAction: Link Scraper{'url': 'https://en.wikipedia.org/wiki/Ethiopian_calendar'}\\nOutput: \\n\\nGiven the context and the action-result history, please complete the task mentioned. Include any links you used from the context and history in the result.\\nTask: what is unique about Ethiopia calendar? based on this link https://en.wikipedia.org/wiki/Ethiopian_calendar\\nResult:\\n", "agent": "human"}], "output": "The Ethiopian calendar is unique in that it follows the ancient Coptic calendar, which is based on the Egyptian calendar. 
One of the main differences is that the Ethiopian calendar has 13 months, with 12 months having 30 days each and a 13th month called Pagum\u0113 having 5 or 6 days (depending on whether it's a leap year). This results in the Ethiopian calendar being roughly 7-8 years behind the Gregorian calendar. Additionally, the Ethiopian New Year falls on September 11 or 12 (depending on whether it's a leap year), which is different from the January 1st New Year in the Gregorian calendar.\\n\\nSource: [Wikipedia - Ethiopian calendar](https://en.wikipedia.org/wiki/Ethiopian_calendar)", "llm_name": "gpt-3.5-turbo"} diff --git a/src/tests/integration_tests/test_link_scraper_action.py b/src/tests/integration_tests/test_link_scraper_action.py new file mode 100644 index 00000000..fd912e9a --- /dev/null +++ b/src/tests/integration_tests/test_link_scraper_action.py @@ -0,0 +1,94 @@ +from unittest.mock import patch + +import pytest +from loguru import logger + +from sherpa_ai.actions.link_scraper import LinkScraperAction +from sherpa_ai.actions.google_search import GoogleSearch +import sherpa_ai.config as cfg +from sherpa_ai.agents.qa_agent import QAAgent +from sherpa_ai.events import EventType +from sherpa_ai.memory import SharedMemory +from sherpa_ai.memory.belief import Belief +from sherpa_ai.models.sherpa_base_chat_model import SherpaChatOpenAI +from sherpa_ai.test_utils.llms import get_llm +from sherpa_ai.tools import LinkScraperTool +from sherpa_ai.utils import extract_entities + + +@pytest.mark.parametrize( + "test_id, objective, input_data", + [ + ( + 0, + "what is unique about Ethiopia calendar? based on these links https://www.ethiopiancalendar.net/", + { + "Document": "The Ethiopia calendar is similar to the Coptic Egyptian calendar since both have 13 months, 12 of which have 30 days and an intercalary month at the end of the year called Pagume which means 'forgotten days' in Greek. 
This last month has five days or six days in a leap year.", + "Source": "https://www.ethiopiancalendar.net/" + }, + ), + ( + 1, + "what is unique about Ethiopia calendar? based on this link https://en.wikipedia.org/wiki/Ethiopian_calendar", + { + "Document": "The Ethiopian calendar is a solar calendar that is derived from the Egyptian calendar, but with some differences. It has 13 months: 12 of 30 days each and an additional month at the end of the year with five or six days.", + "Source": "https://en.wikipedia.org/wiki/Ethiopian_calendar" + }, + ) + ], +) +def test_link_scraper_succeeds_in_qa( + get_llm, test_id, objective, input_data, +): + def mock_run(url): + if url == "https://www.ethiopiancalendar.net/": + return { + "Document": "The Ethiopia calendar is similar to the Coptic Egyptian calendar since both have 13 months, 12 of which have 30 days and an intercalary month at the end of the year called Pagume which means 'forgotten days' in Greek. This last month has five days or six days in a leap year.", + "Source": "https://www.ethiopiancalendar.net/" + } + elif url == "https://en.wikipedia.org/wiki/Ethiopian_calendar": + return { + "Document": "The Ethiopian calendar is a solar calendar that is derived from the Egyptian calendar, but with some differences. 
It has 13 months: 12 of 30 days each and an additional month at the end of the year with five or six days.", + "Source": "https://en.wikipedia.org/wiki/Ethiopian_calendar" + } + else: + return { + "Document": "Unknown content", + "Source": url + } + llm = get_llm( + __file__, test_link_scraper_succeeds_in_qa.__name__[0] + f"_{str(test_id)}" + ) + + belief = Belief() + shared_memory = SharedMemory( + objective=objective, + agent_pool=None, + ) + + link_scraper_action = LinkScraperAction(llm=llm) + + with patch.object(LinkScraperTool, "_run", side_effect=mock_run): + task_agent = QAAgent( + llm=llm, + shared_memory=shared_memory, + num_runs=3, + actions=[link_scraper_action], + validation_steps=3, + belief=belief, + ) + + shared_memory.add( + EventType.task, + "Scraper", + objective, + ) + + task_agent.run() + + results = shared_memory.get_by_type(EventType.result) + logger.info(results[0].content) + assert any(result[0] in results[0].content and + result[1] in results[0].content + for result in input_data), "Result not found in input_data" +