Skip to content

Commit

Permalink
Merge pull request #873 from AdarshJha619/read_file
Browse files: browse the repository at this point in the history
  • Loading branch information
nborthy authored Jul 29, 2023
2 parents 532bb66 + a64405c commit aa4b70b
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 6 deletions.
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -153,3 +153,5 @@ EbookLib==0.18
html2text==2020.1.16
duckduckgo-search==3.8.3
google-generativeai==0.1.0
unstructured==0.8.1
beautifulsoup4==4.12.2
29 changes: 24 additions & 5 deletions superagi/tools/file/read_file.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@

import os
from typing import Type, Optional
import ebooklib
import bs4
from bs4 import BeautifulSoup

from pydantic import BaseModel, Field
from ebooklib import epub

from superagi.helper.resource_helper import ResourceHelper
from superagi.helper.s3_helper import S3Helper
Expand All @@ -11,7 +16,7 @@
from superagi.models.agent import Agent
from superagi.types.storage_types import StorageType
from superagi.config.config import get_config

from unstructured.partition.auto import partition

class ReadFileSchema(BaseModel):
"""Input for CopyFileTool."""
Expand Down Expand Up @@ -57,9 +62,23 @@ def _execute(self, file_name: str):
raise FileNotFoundError(f"File '{file_name}' not found.")
directory = os.path.dirname(final_path)
os.makedirs(directory, exist_ok=True)

# Check if the file is an .epub file
if final_path.lower().endswith('.epub'):
# Use ebooklib to read the epub file
book = epub.read_epub(final_path)
# Get the text content from each item in the book
content = []
for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
soup = BeautifulSoup(item.get_content(), 'html.parser')
content.append(soup.get_text())

content = "\n".join(content)
else:
elements = partition(final_path)
content = "\n\n".join([str(el) for el in elements])

return content


with open(final_path, 'r') as file:
file_content = file.read()
max_length = len(' '.join(file_content.split(" ")[:1000]))
return file_content[:max_length] + "\n File " + file_name + " read successfully."

2 changes: 1 addition & 1 deletion tools.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"tools": {
}
}
}

0 comments on commit aa4b70b

Please sign in to comment.