Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add filesystem abstraction #251

Draft pull request: wants to merge 5 commits into base branch `main`.
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions examples/python_api/fsspec_demo.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from ragna import Rag, assistants, source_storages\n",
"\n",
"rag = Rag()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from ragna.core import filesystem_glob\n",
"\n",
"globs = filesystem_glob(\"github://nenb/Notes/programming/ADR/**.txt\")\n",
"# ugly, but it works for now\n",
"documents = [\"github://nenb/Notes/\" + glob for glob in globs]\n",
"\n",
"chat = rag.chat(\n",
" documents=documents,\n",
" source_storage=source_storages.LanceDB,\n",
" assistant=assistants.Gpt35Turbo16k,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Message(content='How can I help you with the documents?', role=<MessageRole.SYSTEM: 'system'>, sources=[])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"await chat.prepare()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SQS was chosen for task queues due to several factors. One of the deciding factors was the simplicity of the client interface, including the built-in support for Dead Letter Queues. Additionally, SQS can be easily replicated across regions if necessary in the future. Another consideration was the cost-effectiveness of using SQS compared to other options like Kafka, especially when replicated across regions. Overall, SQS provided a simple and reliable solution for managing task queues in the system.\n"
]
}
],
"source": [
"print(await chat.answer(\"Why was SQS chosen for task queues?\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "ragna-dev",
"language": "python",
"name": "ragna-dev"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
192 changes: 192 additions & 0 deletions examples/rest_api/fsspec_demo.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Response [200]>"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import requests\n",
"\n",
"# smoke-test -> have you started the API?\n",
"requests.get(\"http://localhost:31476/docs\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"t = requests.post(\n",
" \"http://localhost:31476/token\", data={\"username\": \"nenb\", \"password\": \"nenb\"}\n",
").json()\n",
"\n",
"headers = {\"Authorization\": f\"Bearer {t}\"}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'filesystems': ['github', 'local'],\n",
" 'documents': ['.pdf', '.txt'],\n",
" 'source_storages': [{'properties': {'chunk_overlap': {'default': 250,\n",
" 'title': 'Chunk Overlap',\n",
" 'type': 'integer'},\n",
" 'chunk_size': {'default': 500, 'title': 'Chunk Size', 'type': 'integer'},\n",
" 'num_tokens': {'default': 1024, 'title': 'Num Tokens', 'type': 'integer'}},\n",
" 'required': [],\n",
" 'title': 'Chroma',\n",
" 'type': 'object'},\n",
" {'properties': {'chunk_overlap': {'default': 250,\n",
" 'title': 'Chunk Overlap',\n",
" 'type': 'integer'},\n",
" 'chunk_size': {'default': 500, 'title': 'Chunk Size', 'type': 'integer'},\n",
" 'num_tokens': {'default': 1024, 'title': 'Num Tokens', 'type': 'integer'}},\n",
" 'required': [],\n",
" 'title': 'LanceDB',\n",
" 'type': 'object'}],\n",
" 'assistants': [{'properties': {'max_new_tokens': {'default': 256,\n",
" 'title': 'Max New Tokens',\n",
" 'type': 'integer'}},\n",
" 'title': 'OpenAI/gpt-4',\n",
" 'type': 'object'},\n",
" {'properties': {'max_new_tokens': {'default': 256,\n",
" 'title': 'Max New Tokens',\n",
" 'type': 'integer'}},\n",
" 'title': 'OpenAI/gpt-3.5-turbo-16k',\n",
" 'type': 'object'}]}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# there is a new filesystems component\n",
"requests.get(\"http://localhost:31476/components\", headers=headers).json()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"name = \"bm25.pdf\"\n",
"prefixed_path = (\n",
" \"github://papers-we-love/papers-we-love/information_retrieval/okapi-at-trec3.pdf\"\n",
")\n",
"\n",
"# this creates the relevant metadata on the server and stores in the db\n",
"metadata = requests.get(\n",
" f\"http://localhost:31476/document?name={name}&prefixed_path={prefixed_path}\",\n",
" headers=headers,\n",
").json()\n",
"docs = metadata[\"document\"]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"data = json.dumps(\n",
" {\n",
" \"name\": \"BM25\",\n",
" \"source_storage\": \"LanceDB\",\n",
" \"assistant\": \"OpenAI/gpt-3.5-turbo-16k\",\n",
" \"params\": {},\n",
" \"documents\": [docs],\n",
" }\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# create a new chat\n",
"chat_metadata = requests.post(\n",
" \"http://localhost:31476/chats\", headers=headers, data=data\n",
").json()\n",
"chat_id = chat_metadata[\"id\"]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# this retrieves the pdf from github, and stores the embeddings on the server\n",
"requests.post(f\"http://localhost:31476/chats/{chat_id}/prepare\", headers=headers).json()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The new developments for TREC-3 included the introduction of the simple inverse collection frequency (ICF) term-weighting scheme, which incorporated within-document frequency, document length, and within-query frequency components. Additionally, there were advancements in automatic ad hoc and routing results, as well as the development of a user interface and search framework. Query expansion and routing term selection were also successful developments. Modified term-weighting functions and passage retrieval had small beneficial effects.\n"
]
}
],
"source": [
"prompt = \"What were the new developments for TREC-3?\"\n",
"response = requests.post(\n",
" f\"http://localhost:31476/chats/{chat_id}/answer?prompt={prompt}\", headers=headers\n",
").json()\n",
"\n",
"print(response[\"message\"][\"content\"])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "ragna-dev",
"language": "python",
"name": "ragna-dev"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ dependencies = [
"anyio",
"emoji",
"fastapi",
"fsspec",
"httpx",
"importlib_metadata>=4.6; python_version<'3.10'",
"packaging",
Expand Down Expand Up @@ -136,6 +137,7 @@ disallow_incomplete_defs = false
[[tool.mypy.overrides]]
module = [
"fitz",
"fsspec",
"lancedb",
"param",
"pyarrow",
Expand Down
13 changes: 5 additions & 8 deletions ragna/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
"Document",
"DocumentHandler",
"EnvVarRequirement",
"LocalDocument",
"FilesystemDocument",
"filesystem_glob",
"Message",
"MessageRole",
"PackageRequirement",
Expand All @@ -19,22 +20,18 @@
"TxtDocumentHandler",
]

from ._utils import (
EnvVarRequirement,
PackageRequirement,
RagnaException,
Requirement,
)
from ._utils import EnvVarRequirement, PackageRequirement, RagnaException, Requirement

# isort: split

from ._document import (
Document,
DocumentHandler,
LocalDocument,
FilesystemDocument,
Page,
PdfDocumentHandler,
TxtDocumentHandler,
filesystem_glob,
)

# isort: split
Expand Down
Loading