Merge pull request #873 from AdarshJha619/read_file

TransformerOptimus · Jul 29, 2023 · aa4b70b · aa4b70b
2 parents 532bb66 + a64405c
commit aa4b70b
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 6 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -153,3 +153,5 @@ EbookLib==0.18
 html2text==2020.1.16
 duckduckgo-search==3.8.3 
 google-generativeai==0.1.0
+unstructured==0.8.1
+beautifulsoup4==4.12.2
diff --git a/superagi/tools/file/read_file.py b/superagi/tools/file/read_file.py
@@ -1,7 +1,12 @@
+
 import os
 from typing import Type, Optional
+import ebooklib
+import bs4 
+from bs4 import BeautifulSoup
 
 from pydantic import BaseModel, Field
+from ebooklib import epub
 
 from superagi.helper.resource_helper import ResourceHelper
 from superagi.helper.s3_helper import S3Helper
@@ -11,7 +16,7 @@
 from superagi.models.agent import Agent
 from superagi.types.storage_types import StorageType
 from superagi.config.config import get_config
-
+from unstructured.partition.auto import partition
 
 class ReadFileSchema(BaseModel):
     """Input for CopyFileTool."""
@@ -57,9 +62,23 @@ def _execute(self, file_name: str):
             raise FileNotFoundError(f"File '{file_name}' not found.")
         directory = os.path.dirname(final_path)
         os.makedirs(directory, exist_ok=True)
+
+        # Check if the file is an .epub file
+        if final_path.lower().endswith('.epub'):
+            # Use ebooklib to read the epub file
+            book = epub.read_epub(final_path)
+            # Get the text content from each item in the book
+            content = []
+            for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
+                soup = BeautifulSoup(item.get_content(), 'html.parser')
+                content.append(soup.get_text())
+
+            content = "\n".join(content)
+        else:
+            elements = partition(final_path)
+            content = "\n\n".join([str(el) for el in elements])
+
+        return content
+
 
-        with open(final_path, 'r') as file:
-            file_content = file.read()
-        max_length = len(' '.join(file_content.split(" ")[:1000]))
-        return file_content[:max_length] + "\n File " + file_name + " read successfully."
 
diff --git a/tools.json b/tools.json
@@ -1,4 +1,4 @@
 {
   "tools": {
   }
-}
+}