dify/api/tasks/document_indexing_task.py

import datetime
import logging
import time

import click
from celery import shared_task
from flask import current_app

from core.indexing_runner import DocumentIsPausedException, IndexingRunner
from extensions.ext_database import db
from models.dataset import Dataset, Document
from services.feature_service import FeatureService


@shared_task(queue='dataset')
def document_indexing_task(dataset_id: str, document_ids: list):
    """
    Async process document

    Checks the tenant's upload limits when billing is enabled, marks every
    document found in the dataset as 'parsing', and then runs IndexingRunner
    over those documents.

    :param dataset_id: id of the dataset the documents belong to
    :param document_ids: ids of the documents to index; ids that have no
        matching row in this dataset are skipped
    :return: None. If a limit check fails, the matching documents are marked
        with indexing_status 'error' and the task returns without indexing.

    Usage: document_indexing_task.delay(dataset_id, document_ids)
    """
    documents = []
    start_at = time.perf_counter()

    dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
    if dataset is None:
        # Without a dataset there is no tenant to check limits for; bail out
        # instead of failing with AttributeError on `dataset.tenant_id`.
        logging.info(click.style(f'Dataset is not found: {dataset_id}', fg='yellow'))
        return

    # check document limit
    features = FeatureService.get_features(dataset.tenant_id)
    try:
        if features.billing.enabled:
            vector_space = features.vector_space
            count = len(document_ids)
            batch_upload_limit = int(current_app.config['BATCH_UPLOAD_LIMIT'])
            if count > batch_upload_limit:
                raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
            # A limit of 0 (or less) disables the vector space check.
            if 0 < vector_space.limit <= vector_space.size:
                raise ValueError("Your total number of documents plus the number of uploads have over the limit of "
                                 "your subscription.")
    except Exception as e:
        # Any failure of the limit check (including a missing/invalid config
        # value) is recorded on each document so it does not stay pending.
        for document_id in document_ids:
            document = db.session.query(Document).filter(
                Document.id == document_id,
                Document.dataset_id == dataset_id
            ).first()
            if document:
                document.indexing_status = 'error'
                document.error = str(e)
                # Stored as a naive datetime holding the current UTC time.
                document.stopped_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
                db.session.add(document)
        db.session.commit()
        return

    for document_id in document_ids:
        logging.info(click.style(f'Start process document: {document_id}', fg='green'))

        document = db.session.query(Document).filter(
            Document.id == document_id,
            Document.dataset_id == dataset_id
        ).first()

        if document:
            document.indexing_status = 'parsing'
            document.processing_started_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
            documents.append(document)
            db.session.add(document)
    # Persist the 'parsing' status for all documents before indexing starts.
    db.session.commit()

    try:
        indexing_runner = IndexingRunner()
        indexing_runner.run(documents)
        end_at = time.perf_counter()
        logging.info(click.style(f'Processed dataset: {dataset_id} latency: {end_at - start_at}', fg='green'))
    except DocumentIsPausedException as ex:
        logging.info(click.style(str(ex), fg='yellow'))
    except Exception:
        # Still best-effort (the error is not re-raised, so the task outcome
        # is unchanged), but the failure and its traceback are now logged
        # instead of being silently discarded.
        logging.exception(f'Document indexing task failed, dataset_id: {dataset_id}')
Initial commit 2023-05-15 08:51:32 +08:00			`import datetime`
			`import logging`
			`import time`

			`import click`
			`from celery import shared_task`
Fix/upload limit (#2521) Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2024-02-22 17:16:22 +08:00			`from flask import current_app`
enhancement: introduce Ruff for Python linter for reordering and removing unused imports with automated pre-commit and sytle check (#2366) 2024-02-06 13:21:13 +08:00
improve: introduce isort for linting Python imports (#1983) 2024-01-12 12:34:01 +08:00			`from core.indexing_runner import DocumentIsPausedException, IndexingRunner`
Initial commit 2023-05-15 08:51:32 +08:00			`from extensions.ext_database import db`
Fix/upload limit (#2521) Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2024-02-22 17:16:22 +08:00			`from models.dataset import Dataset, Document`
			`from services.feature_service import FeatureService`
Initial commit 2023-05-15 08:51:32 +08:00

feat: add queue to celery task (#688) 2023-07-31 13:13:08 +08:00			`@shared_task(queue='dataset')`
Feat/dataset notion import (#392) Co-authored-by: StyleZhang <jasonapring2015@outlook.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2023-06-16 21:47:51 +08:00			`def document_indexing_task(dataset_id: str, document_ids: list):`
Initial commit 2023-05-15 08:51:32 +08:00			`"""`
			`Async process document`
			`:param dataset_id:`
Feat/dataset notion import (#392) Co-authored-by: StyleZhang <jasonapring2015@outlook.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2023-06-16 21:47:51 +08:00			`:param document_ids:`
Initial commit 2023-05-15 08:51:32 +08:00
			`Usage: document_indexing_task.delay(dataset_id, document_id)`
			`"""`
Feat/dataset notion import (#392) Co-authored-by: StyleZhang <jasonapring2015@outlook.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2023-06-16 21:47:51 +08:00			`documents = []`
feat: upgrade langchain (#430) Co-authored-by: jyong <718720800@qq.com> 2023-06-25 16:49:14 +08:00			`start_at = time.perf_counter()`
Fix/upload limit (#2521) Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2024-02-22 17:16:22 +08:00
			`dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()`

			`# check document limit`
			`features = FeatureService.get_features(dataset.tenant_id)`
			`try:`
			`if features.billing.enabled:`
			`vector_space = features.vector_space`
			`count = len(document_ids)`
			`batch_upload_limit = int(current_app.config['BATCH_UPLOAD_LIMIT'])`
			`if count > batch_upload_limit:`
			`raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")`
			`if 0 < vector_space.limit <= vector_space.size:`
			`raise ValueError("Your total number of documents plus the number of uploads have over the limit of "`
			`"your subscription.")`
			`except Exception as e:`
			`for document_id in document_ids:`
			`document = db.session.query(Document).filter(`
			`Document.id == document_id,`
			`Document.dataset_id == dataset_id`
			`).first()`
			`if document:`
			`document.indexing_status = 'error'`
			`document.error = str(e)`
feat: Deprecate datetime.utcnow() in favor of datetime.now(timezone.utc).replace(tzinfo=None) for better timezone handling (#3408) (#3416) 2024-04-12 16:22:24 +08:00			`document.stopped_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)`
Fix/upload limit (#2521) Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2024-02-22 17:16:22 +08:00			`db.session.add(document)`
			`db.session.commit()`
			`return`

Feat/dataset notion import (#392) Co-authored-by: StyleZhang <jasonapring2015@outlook.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2023-06-16 21:47:51 +08:00			`for document_id in document_ids:`
			`logging.info(click.style('Start process document: {}'.format(document_id), fg='green'))`

			`document = db.session.query(Document).filter(`
			`Document.id == document_id,`
			`Document.dataset_id == dataset_id`
			`).first()`

Feat/improve document delete logic (#1325) Co-authored-by: jyong <jyong@dify.ai> 2023-10-12 13:30:44 +08:00			`if document:`
			`document.indexing_status = 'parsing'`
feat: Deprecate datetime.utcnow() in favor of datetime.now(timezone.utc).replace(tzinfo=None) for better timezone handling (#3408) (#3416) 2024-04-12 16:22:24 +08:00			`document.processing_started_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)`
Feat/improve document delete logic (#1325) Co-authored-by: jyong <jyong@dify.ai> 2023-10-12 13:30:44 +08:00			`documents.append(document)`
			`db.session.add(document)`
Initial commit 2023-05-15 08:51:32 +08:00			`db.session.commit()`

			`try:`
			`indexing_runner = IndexingRunner()`
Feat/dataset notion import (#392) Co-authored-by: StyleZhang <jasonapring2015@outlook.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2023-06-16 21:47:51 +08:00			`indexing_runner.run(documents)`
Initial commit 2023-05-15 08:51:32 +08:00			`end_at = time.perf_counter()`
feat: upgrade langchain (#430) Co-authored-by: jyong <718720800@qq.com> 2023-06-25 16:49:14 +08:00			`logging.info(click.style('Processed dataset: {} latency: {}'.format(dataset_id, end_at - start_at), fg='green'))`
			`except DocumentIsPausedException as ex:`
			`logging.info(click.style(str(ex), fg='yellow'))`
			`except Exception:`
			`pass`