Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
GareArc committed Dec 25, 2024
1 parent 8df27e0 commit 9973224
Show file tree
Hide file tree
Showing 23 changed files with 201 additions and 202 deletions.
69 changes: 39 additions & 30 deletions api/controllers/console/datasets/datasets_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,49 +3,58 @@
from datetime import UTC, datetime
from typing import cast

from flask import request
from flask_login import current_user # type: ignore
from flask_restful import Resource, fields, marshal, marshal_with, reqparse # type: ignore
from sqlalchemy import asc, desc
from transformers.hf_argparser import string_to_bool # type: ignore
from werkzeug.exceptions import Forbidden, NotFound

import services
from controllers.console import api
from controllers.console.app.error import (
ProviderModelCurrentlyNotSupportError, ProviderNotInitializeError,
ProviderQuotaExceededError)
from controllers.console.datasets.error import (ArchivedDocumentImmutableError,
DocumentAlreadyFinishedError,
DocumentIndexingError,
IndexingEstimateError,
InvalidActionError,
InvalidMetadataError)
from controllers.console.wraps import (account_initialization_required,
cloud_edition_billing_resource_check,
setup_required)
from core.errors.error import (LLMBadRequestError,
ModelCurrentlyNotSupportError,
ProviderTokenNotInitError, QuotaExceededError)
ProviderModelCurrentlyNotSupportError,
ProviderNotInitializeError,
ProviderQuotaExceededError,
)
from controllers.console.datasets.error import (
ArchivedDocumentImmutableError,
DocumentAlreadyFinishedError,
DocumentIndexingError,
IndexingEstimateError,
InvalidActionError,
InvalidMetadataError,
)
from controllers.console.wraps import (
account_initialization_required,
cloud_edition_billing_resource_check,
setup_required,
)
from core.errors.error import (
LLMBadRequestError,
ModelCurrentlyNotSupportError,
ProviderTokenNotInitError,
QuotaExceededError,
)
from core.indexing_runner import IndexingRunner
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.model_runtime.errors.invoke import InvokeAuthorizationError
from core.rag.extractor.entity.extract_setting import ExtractSetting
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from fields.document_fields import (dataset_and_document_fields,
document_fields, document_status_fields,
document_with_segments_fields)
from flask import request
from flask_login import current_user # type: ignore
from flask_restful import (Resource, fields, marshal, # type: ignore
marshal_with, reqparse)
from fields.document_fields import (
dataset_and_document_fields,
document_fields,
document_status_fields,
document_with_segments_fields,
)
from libs.login import login_required
from models import (Dataset, DatasetProcessRule, Document, DocumentSegment,
UploadFile)
from models import Dataset, DatasetProcessRule, Document, DocumentSegment, UploadFile
from services.dataset_service import DatasetService, DocumentService
from services.entities.knowledge_entities.knowledge_entities import \
KnowledgeConfig
from sqlalchemy import asc, desc
from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig
from tasks.add_document_to_index_task import add_document_to_index_task
from tasks.remove_document_from_index_task import \
remove_document_from_index_task
from transformers.hf_argparser import string_to_bool # type: ignore
from werkzeug.exceptions import Forbidden, NotFound
from tasks.remove_document_from_index_task import remove_document_from_index_task


class DocumentResource(Resource):
Expand Down
24 changes: 12 additions & 12 deletions api/controllers/console/workspace/account.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,29 @@
import datetime

import pytz
from flask import request
from flask_login import current_user # type: ignore
from flask_restful import Resource, fields, marshal_with, reqparse # type: ignore

from configs import dify_config
from constants.languages import supported_language
from controllers.console import api
from controllers.console.workspace.error import (
AccountAlreadyInitedError, CurrentPasswordIncorrectError,
InvalidAccountDeletionCodeError, InvalidInvitationCodeError,
RepeatPasswordNotMatchError)
from controllers.console.wraps import (account_initialization_required,
enterprise_license_required,
setup_required)
AccountAlreadyInitedError,
CurrentPasswordIncorrectError,
InvalidAccountDeletionCodeError,
InvalidInvitationCodeError,
RepeatPasswordNotMatchError,
)
from controllers.console.wraps import account_initialization_required, enterprise_license_required, setup_required
from extensions.ext_database import db
from fields.member_fields import account_fields
from flask import request
from flask_login import current_user # type: ignore
from flask_restful import (Resource, fields, marshal_with, # type: ignore
reqparse)
from libs.helper import TimestampField, timezone
from libs.login import login_required
from models import AccountIntegrate, InvitationCode
from services.account_service import AccountService
from services.billing_service import BillingService
from services.errors.account import \
CurrentPasswordIncorrectError as ServiceCurrentPasswordIncorrectError
from services.errors.account import CurrentPasswordIncorrectError as ServiceCurrentPasswordIncorrectError


class AccountInitApi(Resource):
Expand Down
12 changes: 7 additions & 5 deletions api/core/file/file_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@

from configs import dify_config
from core.helper import ssrf_proxy
from core.model_runtime.entities import (AudioPromptMessageContent,
DocumentPromptMessageContent,
ImagePromptMessageContent,
MultiModalPromptMessageContent,
VideoPromptMessageContent)
from core.model_runtime.entities import (
AudioPromptMessageContent,
DocumentPromptMessageContent,
ImagePromptMessageContent,
MultiModalPromptMessageContent,
VideoPromptMessageContent,
)
from extensions.ext_storage import storage

from . import helpers
Expand Down
34 changes: 16 additions & 18 deletions api/core/indexing_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,12 @@
import uuid
from typing import Any, Optional, cast

from flask import current_app
from flask_login import current_user
from sqlalchemy.orm.exc import ObjectDeletedError

from configs import dify_config
from core.entities.knowledge_entities import (IndexingEstimate, PreviewDetail,
QAPreviewDetail)
from core.entities.knowledge_entities import IndexingEstimate, PreviewDetail, QAPreviewDetail
from core.errors.error import ProviderTokenNotInitError
from core.model_manager import ModelInstance, ModelManager
from core.model_runtime.entities.model_entities import ModelType
Expand All @@ -20,25 +23,22 @@
from core.rag.extractor.entity.extract_setting import ExtractSetting
from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_base import BaseIndexProcessor
from core.rag.index_processor.index_processor_factory import \
IndexProcessorFactory
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.rag.models.document import ChildDocument, Document
from core.rag.splitter.fixed_text_splitter import (
EnhanceRecursiveCharacterTextSplitter, FixedRecursiveCharacterTextSplitter)
EnhanceRecursiveCharacterTextSplitter,
FixedRecursiveCharacterTextSplitter,
)
from core.rag.splitter.text_splitter import TextSplitter
from core.tools.utils.web_reader_tool import get_image_upload_file_ids
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from extensions.ext_storage import storage
from flask import current_app
from flask_login import current_user
from libs import helper
from models.dataset import ChildChunk, Dataset, DatasetProcessRule
from models.dataset import ChildChunk, Dataset, DatasetProcessRule, DocumentSegment
from models.dataset import Document as DatasetDocument
from models.dataset import DocumentSegment
from models.model import UploadFile
from services.feature_service import FeatureService
from sqlalchemy.orm.exc import ObjectDeletedError


class IndexingRunner:
Expand Down Expand Up @@ -293,15 +293,15 @@ def indexing_estimate(
process_rule=processing_rule.to_dict(),
tenant_id=current_user.current_tenant_id,
doc_language=doc_language,
preview=True
preview=True,
)
total_segments += len(documents)
for document in documents:
if len(preview_texts) < 10:
if doc_form and doc_form == "qa_model":
preview_detail = QAPreviewDetail(question=document.page_content,
answer=document.metadata.get("answer")
)
preview_detail = QAPreviewDetail(
question=document.page_content, answer=document.metadata.get("answer")
)
preview_texts.append(preview_detail)
else:
preview_detail = PreviewDetail(content=document.page_content)
Expand All @@ -324,9 +324,7 @@ def indexing_estimate(
db.session.delete(image_file)

if doc_form and doc_form == "qa_model":
return IndexingEstimate(
total_segments=total_segments * 20, qa_preview=preview_texts, preview=[]
)
return IndexingEstimate(total_segments=total_segments * 20, qa_preview=preview_texts, preview=[])
return IndexingEstimate(total_segments=total_segments, preview=preview_texts)

def _extract(
Expand Down Expand Up @@ -545,7 +543,7 @@ def _load(
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
futures = []
for i in range(0, len(documents), chunk_size):
chunk_documents = documents[i: i + chunk_size]
chunk_documents = documents[i : i + chunk_size]
futures.append(
executor.submit(
self._process_chunk,
Expand Down
6 changes: 3 additions & 3 deletions api/core/rag/datasource/retrieval_service.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import threading
from typing import Optional

from flask import Flask, current_app

from core.rag.data_post_processor.data_post_processor import DataPostProcessor
from core.rag.datasource.keyword.keyword_factory import Keyword
from core.rag.datasource.vdb.vector_factory import Vector
Expand All @@ -10,10 +12,8 @@
from core.rag.rerank.rerank_type import RerankMode
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from extensions.ext_database import db
from flask import Flask, current_app
from models.dataset import ChildChunk, Dataset
from models.dataset import ChildChunk, Dataset, DocumentSegment
from models.dataset import Document as DatasetDocument
from models.dataset import DocumentSegment
from services.external_knowledge_service import ExternalDatasetService

default_retrieval_model = {
Expand Down
3 changes: 2 additions & 1 deletion api/core/rag/docstore/dataset_docstore.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from collections.abc import Sequence
from typing import Any, Optional

from sqlalchemy import func

from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.rag.models.document import Document
from extensions.ext_database import db
from models.dataset import ChildChunk, Dataset, DocumentSegment
from sqlalchemy import func


class DatasetDocumentStore:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@
from extensions.ext_database import db
from libs import helper
from models.dataset import ChildChunk, Dataset, DocumentSegment
from services.entities.knowledge_entities.knowledge_entities import (
ParentMode, Rule)
from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule


class ParentChildIndexProcessor(BaseIndexProcessor):
Expand Down Expand Up @@ -124,9 +123,7 @@ def clean(self, dataset: Dataset, node_ids: Optional[list[str]], with_keywords:
vector.delete()

if delete_child_chunks:
db.session.query(ChildChunk).filter(
ChildChunk.dataset_id == dataset.id
).delete()
db.session.query(ChildChunk).filter(ChildChunk.dataset_id == dataset.id).delete()
db.session.commit()

def retrieve(
Expand Down
19 changes: 11 additions & 8 deletions api/core/rag/index_processor/processor/qa_index_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
from typing import Optional

import pandas as pd
from flask import Flask, current_app
from werkzeug.datastructures import FileStorage

from core.llm_generator.llm_generator import LLMGenerator
from core.rag.cleaner.clean_processor import CleanProcessor
from core.rag.datasource.retrieval_service import RetrievalService
Expand All @@ -16,11 +19,9 @@
from core.rag.index_processor.index_processor_base import BaseIndexProcessor
from core.rag.models.document import Document
from core.tools.utils.text_processing_utils import remove_leading_symbols
from flask import Flask, current_app
from libs import helper
from models.dataset import Dataset
from services.entities.knowledge_entities.knowledge_entities import Rule
from werkzeug.datastructures import FileStorage


class QAIndexProcessor(BaseIndexProcessor):
Expand Down Expand Up @@ -69,15 +70,17 @@ def transform(self, documents: list[Document], **kwargs) -> list[Document]:
split_documents.append(document_node)
all_documents.extend(split_documents)
if preview:
self._format_qa_document(current_app._get_current_object(),
kwargs.get("tenant_id"),
all_documents[0],
all_qa_documents,
kwargs.get("doc_language", "English"))
self._format_qa_document(
current_app._get_current_object(),
kwargs.get("tenant_id"),
all_documents[0],
all_qa_documents,
kwargs.get("doc_language", "English"),
)
else:
for i in range(0, len(all_documents), 10):
threads = []
sub_documents = all_documents[i: i + 10]
sub_documents = all_documents[i : i + 10]
for doc in sub_documents:
document_format_thread = threading.Thread(
target=self._format_qa_document,
Expand Down
15 changes: 1 addition & 14 deletions api/core/rag/models/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,7 @@
from collections.abc import Sequence
from typing import Any, Optional

from pydantic import BaseModel


class ChildDocument(BaseModel):
"""Class for storing a piece of text and associated metadata."""

page_content: str

vector: Optional[list[float]] = None

"""Arbitrary metadata about the page content (e.g., source, relationships to other
documents, etc.).
"""
metadata: dict = {}
from pydantic import BaseModel, Field


class ChildDocument(BaseModel):
Expand Down
Loading

0 comments on commit 9973224

Please sign in to comment.