update clean embedding cache query logic (#6483)

This commit is contained in:
Jyong 2024-07-20 01:29:25 +08:00 committed by GitHub
parent 27e08a8e2e
commit 1e0e573165
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 46 additions and 6 deletions

View File

@ -0,0 +1,32 @@
"""add-embedding-cache-created_at_index
Revision ID: 6e957a32015b
Revises: fecff1c3da27
Create Date: 2024-07-19 17:21:34.414705
"""
from alembic import op
import models as models
# Revision identifiers, used by Alembic.
# `revision` is this migration's own ID and `down_revision` is the migration
# it revises; both match the values stated in the module docstring.
revision = '6e957a32015b'
down_revision = 'fecff1c3da27'
# No branch labels and no cross-migration dependencies are declared.
branch_labels = None
depends_on = None
def upgrade():
    """Create the non-unique ``created_at_idx`` index on ``embeddings.created_at``."""
    # The index name must stay in sync with the one dropped in downgrade().
    with op.batch_alter_table('embeddings', schema=None) as embeddings_table:
        embeddings_table.create_index(
            'created_at_idx',
            ['created_at'],
            unique=False,
        )
def downgrade():
    """Drop the ``created_at_idx`` index from the ``embeddings`` table."""
    # Reverses upgrade(): removes the index it created, leaving the column intact.
    with op.batch_alter_table('embeddings', schema=None) as embeddings_table:
        embeddings_table.drop_index('created_at_idx')

View File

@ -630,7 +630,8 @@ class Embedding(db.Model):
__tablename__ = 'embeddings'
__table_args__ = (
db.PrimaryKeyConstraint('id', name='embedding_pkey'),
db.UniqueConstraint('model_name', 'hash', 'provider_name', name='embedding_hash_idx')
db.UniqueConstraint('model_name', 'hash', 'provider_name', name='embedding_hash_idx'),
db.Index('created_at_idx', 'created_at')
)
id = db.Column(StringUUID, primary_key=True, server_default=db.text('uuid_generate_v4()'))

View File

@ -1383,7 +1383,7 @@ class TraceAppConfig(db.Model):
__tablename__ = 'trace_app_config'
__table_args__ = (
db.PrimaryKeyConstraint('id', name='tracing_app_config_pkey'),
db.Index('tracing_app_config_app_id_idx', 'app_id'),
db.Index('trace_app_config_app_id_idx', 'app_id'),
)
id = db.Column(StringUUID, server_default=db.text('uuid_generate_v4()'))

View File

@ -2,6 +2,7 @@ import datetime
import time
import click
from sqlalchemy import text
from werkzeug.exceptions import NotFound
import app
@ -18,12 +19,18 @@ def clean_embedding_cache_task():
thirty_days_ago = datetime.datetime.now() - datetime.timedelta(days=clean_days)
while True:
try:
embeddings = db.session.query(Embedding).filter(Embedding.created_at < thirty_days_ago) \
embedding_ids = db.session.query(Embedding.id).filter(Embedding.created_at < thirty_days_ago) \
.order_by(Embedding.created_at.desc()).limit(100).all()
embedding_ids = [embedding_id[0] for embedding_id in embedding_ids]
except NotFound:
break
for embedding in embeddings:
db.session.delete(embedding)
db.session.commit()
if embedding_ids:
db.session.execute(text(
"DELETE FROM embeddings WHERE id in :embedding_ids"
), {'embedding_ids': tuple(embedding_ids)})
db.session.commit()
else:
break
end_at = time.perf_counter()
click.echo(click.style('Cleaned embedding cache from db success latency: {}'.format(end_at - start_at), fg='green'))