mirror of
https://github.com/idrainformatica/PecFlow.git
synced 2026-06-16 12:45:42 +02:00
Semantic search
This commit is contained in:
@@ -0,0 +1,76 @@
|
||||
"""add full text search vector to messages and extracted_text to attachments
|
||||
|
||||
Revision ID: 0008
|
||||
Revises: 0007
|
||||
Create Date: 2026-03-25
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
revision = '0008'
|
||||
down_revision = '0007'
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add full-text search support to ``messages`` and ``attachments``.

    Steps, in order:

    1. add the nullable ``messages.search_vector`` tsvector column;
    2. add the nullable ``attachments.extracted_text`` text column;
    3. create the ``messages_search_vector_update()`` trigger function;
    4. attach it as a BEFORE INSERT/UPDATE row trigger on ``messages``;
    5. backfill ``search_vector`` for the rows that already exist;
    6. create the partial GIN index ``idx_messages_fts``.

    The index is deliberately built last: creating a GIN index once over
    already-populated data is cheaper than maintaining it row by row while
    the backfill UPDATE rewrites every message.
    """
    # 1. Add the search_vector column to messages.
    op.add_column(
        'messages',
        sa.Column('search_vector', postgresql.TSVECTOR(), nullable=True),
    )

    # 2. Add the extracted_text column to attachments (text extracted from
    #    PDF/DOCX files).
    op.add_column(
        'attachments',
        sa.Column('extracted_text', sa.Text(), nullable=True),
    )

    # 3. Trigger function that recomputes search_vector from the textual
    #    fields: subject (weight A), sender and recipients (weight B),
    #    body text (weight C).
    op.execute("""
        CREATE OR REPLACE FUNCTION messages_search_vector_update() RETURNS trigger AS $$
        BEGIN
            NEW.search_vector :=
                setweight(to_tsvector('italian', coalesce(NEW.subject, '')), 'A') ||
                setweight(to_tsvector('simple', coalesce(NEW.from_address, '')), 'B') ||
                setweight(to_tsvector('simple',
                    coalesce(array_to_string(NEW.to_addresses, ' '), '')), 'B') ||
                setweight(to_tsvector('italian', coalesce(NEW.body_text, '')), 'C');
            RETURN NEW;
        END;
        $$ LANGUAGE plpgsql;
    """)

    # 4. Create the trigger (fires on INSERT and on UPDATE of the relevant
    #    columns only).
    op.execute("""
        CREATE TRIGGER trg_messages_search_vector
        BEFORE INSERT OR UPDATE OF subject, from_address, to_addresses, body_text
        ON messages
        FOR EACH ROW EXECUTE FUNCTION messages_search_vector_update();
    """)

    # 5. Backfill: populate search_vector for existing messages. This UPDATE
    #    only targets search_vector, which is not in the trigger's column
    #    list, so the expression is spelled out here instead of relying on
    #    the trigger.
    op.execute("""
        UPDATE messages SET search_vector =
            setweight(to_tsvector('italian', coalesce(subject, '')), 'A') ||
            setweight(to_tsvector('simple', coalesce(from_address, '')), 'B') ||
            setweight(to_tsvector('simple',
                coalesce(array_to_string(to_addresses, ' '), '')), 'B') ||
            setweight(to_tsvector('italian', coalesce(body_text, '')), 'C')
        WHERE search_vector IS NULL
    """)

    # 6. GIN index for fast full-text search, built after the backfill so it
    #    is created in a single pass over the populated column.
    op.execute(
        "CREATE INDEX idx_messages_fts ON messages USING gin(search_vector) "
        "WHERE search_vector IS NOT NULL"
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Undo revision 0008.

    Drops the trigger, its function and the GIN index first, then removes
    the two columns added by the upgrade.
    """
    # Raw SQL teardown, in dependency order: trigger -> function -> index.
    teardown_statements = (
        "DROP TRIGGER IF EXISTS trg_messages_search_vector ON messages",
        "DROP FUNCTION IF EXISTS messages_search_vector_update()",
        "DROP INDEX IF EXISTS idx_messages_fts",
    )
    for statement in teardown_statements:
        op.execute(statement)

    # Columns are dropped last, attachments before messages.
    added_columns = (
        ('attachments', 'extracted_text'),
        ('messages', 'search_vector'),
    )
    for table_name, column_name in added_columns:
        op.drop_column(table_name, column_name)
|
||||
@@ -26,6 +26,8 @@ from sqlalchemy import func, or_, select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.services.search_service import SearchService
|
||||
|
||||
from app.config import get_settings
|
||||
from app.core.exceptions import ForbiddenError, NotFoundError
|
||||
from app.database import get_db
|
||||
@@ -184,8 +186,11 @@ async def list_messages(
|
||||
is_starred: Optional[bool] = Query(None),
|
||||
is_archived: Optional[bool] = Query(False),
|
||||
is_trashed: Optional[bool] = Query(False),
|
||||
search: Optional[str] = Query(None, max_length=200),
|
||||
search: Optional[str] = Query(None, max_length=500),
|
||||
pec_type: Optional[str] = Query(None),
|
||||
# Filtri data (ISO 8601, es. 2026-01-01T00:00:00Z)
|
||||
date_from: Optional[datetime] = Query(None, description="Data minima (received_at o sent_at)"),
|
||||
date_to: Optional[datetime] = Query(None, description="Data massima (received_at o sent_at)"),
|
||||
# Paginazione
|
||||
page: int = Query(1, ge=1),
|
||||
page_size: int = Query(50, ge=1, le=200),
|
||||
@@ -195,7 +200,8 @@ async def list_messages(
|
||||
|
||||
- `is_archived=False` (default) esclude i messaggi archiviati.
|
||||
- `is_trashed=False` (default) esclude i messaggi nel cestino.
|
||||
- `search` cerca su subject, from_address, to_addresses.
|
||||
- `search` usa ricerca full-text (tsvector) con fallback ILIKE.
|
||||
- `date_from` / `date_to` filtrano per data ricezione o invio.
|
||||
- `vbox_id` filtra per Virtual Box assegnata all'utente corrente.
|
||||
"""
|
||||
# Determinare le caselle visibili (normale check permessi)
|
||||
@@ -284,16 +290,30 @@ async def list_messages(
|
||||
if is_trashed is not None:
|
||||
q = q.where(Message.is_trashed == is_trashed)
|
||||
|
||||
# ── Full-text search (FTS con fallback ILIKE per messaggi non indicizzati) ───
|
||||
if search:
|
||||
term = f"%{search}%"
|
||||
from sqlalchemy import case as sa_case
|
||||
|
||||
tsquery = func.websearch_to_tsquery("italian", search)
|
||||
term_like = f"%{search}%"
|
||||
q = q.where(
|
||||
or_(
|
||||
Message.subject.ilike(term),
|
||||
Message.from_address.ilike(term),
|
||||
Message.body_text.ilike(term),
|
||||
Message.search_vector.op("@@")(tsquery),
|
||||
# Fallback per messaggi non ancora indicizzati dal worker
|
||||
Message.search_vector.is_(None) & or_(
|
||||
Message.subject.ilike(term_like),
|
||||
Message.from_address.ilike(term_like),
|
||||
Message.body_text.ilike(term_like),
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
# ── Filtri data ───────────────────────────────────────────────────────────
|
||||
if date_from:
|
||||
q = q.where(or_(Message.received_at >= date_from, Message.sent_at >= date_from))
|
||||
if date_to:
|
||||
q = q.where(or_(Message.received_at <= date_to, Message.sent_at <= date_to))
|
||||
|
||||
# Applica le regole della Virtual Box (AND tra le regole)
|
||||
for rule in vbox_rules:
|
||||
q = _apply_vbox_rule(q, rule.field, rule.operator, rule.value)
|
||||
@@ -302,13 +322,23 @@ async def list_messages(
|
||||
count_q = select(func.count()).select_from(q.subquery())
|
||||
total = (await db.execute(count_q)).scalar_one()
|
||||
|
||||
# Ordinamento e paginazione
|
||||
# Ordinamento: se c'e' una ricerca, ordina per rilevanza FTS, poi data
|
||||
if search:
|
||||
from sqlalchemy import case as sa_case
|
||||
|
||||
tsquery_ord = func.websearch_to_tsquery("italian", search)
|
||||
rank_expr = sa_case(
|
||||
(Message.search_vector.isnot(None), func.ts_rank(Message.search_vector, tsquery_ord)),
|
||||
else_=0.0,
|
||||
)
|
||||
order_clauses = [rank_expr.desc(), Message.received_at.desc().nullslast(), Message.created_at.desc()]
|
||||
else:
|
||||
order_clauses = [Message.received_at.desc().nullslast(), Message.created_at.desc()]
|
||||
|
||||
# Paginazione
|
||||
q = (
|
||||
q.options(selectinload(Message.labels))
|
||||
.order_by(
|
||||
Message.received_at.desc().nullslast(),
|
||||
Message.created_at.desc(),
|
||||
)
|
||||
.order_by(*order_clauses)
|
||||
.offset((page - 1) * page_size)
|
||||
.limit(page_size)
|
||||
)
|
||||
|
||||
@@ -4,6 +4,7 @@ Modelli Message, Attachment, SendJob.
|
||||
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy import (
|
||||
ARRAY,
|
||||
@@ -18,7 +19,7 @@ from sqlalchemy import (
|
||||
Text,
|
||||
func,
|
||||
)
|
||||
from sqlalchemy.dialects.postgresql import UUID
|
||||
from sqlalchemy.dialects.postgresql import TSVECTOR, UUID
|
||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||
|
||||
from app.database import Base
|
||||
@@ -96,6 +97,9 @@ class Message(Base):
|
||||
|
||||
raw_eml_path: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
|
||||
# Full-text search vector (aggiornato da trigger DB + worker per allegati)
|
||||
search_vector: Mapped[Any | None] = mapped_column(TSVECTOR(), nullable=True)
|
||||
|
||||
created_at: Mapped[datetime] = mapped_column(
|
||||
DateTime(timezone=True), nullable=False, server_default=func.now()
|
||||
)
|
||||
@@ -126,6 +130,7 @@ class Message(Base):
|
||||
postgresql_where="parent_message_id IS NOT NULL",
|
||||
),
|
||||
Index("idx_messages_imap_uid", "mailbox_id", "imap_uid"),
|
||||
Index("idx_messages_fts", "search_vector", postgresql_using="gin"),
|
||||
)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
@@ -149,6 +154,8 @@ class Attachment(Base):
|
||||
size_bytes: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
|
||||
storage_path: Mapped[str] = mapped_column(Text, nullable=False)
|
||||
checksum_sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)
|
||||
# Testo estratto dal worker (solo PDF e DOCX) per la ricerca full-text
|
||||
extracted_text: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
created_at: Mapped[datetime] = mapped_column(
|
||||
DateTime(timezone=True), nullable=False, server_default=func.now()
|
||||
)
|
||||
|
||||
@@ -0,0 +1,139 @@
|
||||
"""
|
||||
Servizio di ricerca full-text per i messaggi PEC.
|
||||
|
||||
Utilizza i vettori tsvector di PostgreSQL per ricerche veloci su:
|
||||
- oggetto (peso A)
|
||||
- mittente / destinatari (peso B)
|
||||
- corpo del messaggio (peso C)
|
||||
- testo estratto dagli allegati PDF/DOCX (peso D)
|
||||
|
||||
Se search_vector e' NULL (messaggio non ancora indicizzato dal worker),
ricade automaticamente su ILIKE sulle colonne base.
|
||||
"""
|
||||
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from sqlalchemy import case, func, or_, select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.models.message import Message
|
||||
|
||||
|
||||
class SearchService:
    """Encapsulates the full-text search logic over messages.

    Rows whose ``search_vector`` is populated are matched with the
    PostgreSQL ``@@`` operator; rows whose ``search_vector`` is NULL fall
    back to a case-insensitive substring match on subject, sender address
    and body text.
    """

    # Escape character declared explicitly in the ILIKE fallback patterns.
    _LIKE_ESCAPE = "\\"

    def __init__(self, db: AsyncSession) -> None:
        """Store the async session used to run the search queries."""
        self.db = db

    @classmethod
    def _like_pattern(cls, term: str) -> str:
        """Return a ``%term%`` pattern that matches *term* literally.

        ``%`` and ``_`` are LIKE wildcards, so a raw user term such as
        ``mario_rossi`` would also match ``marioXrossi``. The escape
        character itself is escaped first so that the escapes added for the
        wildcards are not doubled afterwards.
        """
        esc = cls._LIKE_ESCAPE
        literal = (
            term.replace(esc, esc + esc)
            .replace("%", esc + "%")
            .replace("_", esc + "_")
        )
        return f"%{literal}%"

    async def search_messages(
        self,
        tenant_id: uuid.UUID,
        search_term: str,
        visible_mailbox_ids: Optional[list[uuid.UUID]],
        mailbox_id: Optional[uuid.UUID] = None,
        direction: Optional[str] = None,
        state: Optional[str] = None,
        pec_type: Optional[str] = None,
        date_from: Optional[datetime] = None,
        date_to: Optional[datetime] = None,
        is_archived: Optional[bool] = False,
        is_trashed: Optional[bool] = False,
        is_starred: Optional[bool] = None,
        is_read: Optional[bool] = None,
        page: int = 1,
        page_size: int = 50,
    ) -> tuple[list[Message], int]:
        """Run a paginated full-text search over top-level messages.

        Logic:
          1. Rows with a non-NULL ``search_vector`` are matched with ``@@``
             against ``websearch_to_tsquery('italian', search_term)``.
          2. Rows with a NULL ``search_vector`` fall back to ILIKE on
             subject, from_address and body_text; the term is matched
             literally (LIKE wildcards in it are escaped).
          3. All the optional filters are ANDed with the search condition.
          4. Results are ordered by ``ts_rank`` descending (0.0 for rows
             without a vector), then ``received_at`` descending with NULLs
             last, then ``created_at`` descending.

        Args:
            tenant_id: Only messages of this tenant are considered.
            search_term: Raw user search text.
            visible_mailbox_ids: Mailboxes the caller may see. ``None``
                applies no restriction; an empty list short-circuits to an
                empty result without querying the database.
            mailbox_id: Restrict to a single mailbox when truthy.
            direction: Filter on ``Message.direction`` when truthy.
            state: Filter on ``Message.state`` when truthy.
            pec_type: Filter on ``Message.pec_type`` when truthy.
            date_from: Keep rows whose ``received_at`` or ``sent_at`` is
                greater than or equal to this value.
            date_to: Keep rows whose ``received_at`` or ``sent_at`` is
                less than or equal to this value.
            is_archived: Filter on the archived flag; ``None`` disables it.
            is_trashed: Filter on the trashed flag; ``None`` disables it.
            is_starred: Filter on the starred flag; ``None`` disables it.
            is_read: Filter on the read flag; ``None`` disables it.
            page: 1-based page number.
            page_size: Number of rows per page.

        Returns:
            A ``(items, total)`` tuple: the messages of the requested page
            (with ``labels`` eagerly loaded) and the total number of
            matching rows ignoring pagination.
        """
        # Base query: only top-level messages (no parent) of the tenant.
        q = select(Message).where(
            Message.tenant_id == tenant_id,
            Message.parent_message_id.is_(None),
        )

        # Visible mailboxes restriction (permissions).
        if visible_mailbox_ids is not None:
            if not visible_mailbox_ids:
                return [], 0
            q = q.where(Message.mailbox_id.in_(visible_mailbox_ids))

        # Optional equality filters, applied only when a value was given.
        if mailbox_id:
            q = q.where(Message.mailbox_id == mailbox_id)
        if direction:
            q = q.where(Message.direction == direction)
        if state:
            q = q.where(Message.state == state)
        if pec_type:
            q = q.where(Message.pec_type == pec_type)

        # Boolean flags: None means "do not filter", False is a real filter.
        if is_archived is not None:
            q = q.where(Message.is_archived == is_archived)
        if is_trashed is not None:
            q = q.where(Message.is_trashed == is_trashed)
        if is_starred is not None:
            q = q.where(Message.is_starred == is_starred)
        if is_read is not None:
            q = q.where(Message.is_read == is_read)

        # Date filters: a row passes when either received_at or sent_at
        # satisfies the bound.
        if date_from:
            q = q.where(
                or_(
                    Message.received_at >= date_from,
                    Message.sent_at >= date_from,
                )
            )
        if date_to:
            q = q.where(
                or_(
                    Message.received_at <= date_to,
                    Message.sent_at <= date_to,
                )
            )

        # Full-text search with ILIKE fallback for rows without a vector.
        tsquery = func.websearch_to_tsquery("italian", search_term)
        term_like = self._like_pattern(search_term)
        esc = self._LIKE_ESCAPE

        fts_condition = Message.search_vector.op("@@")(tsquery)
        ilike_fallback = Message.search_vector.is_(None) & or_(
            Message.subject.ilike(term_like, escape=esc),
            Message.from_address.ilike(term_like, escape=esc),
            Message.body_text.ilike(term_like, escape=esc),
        )

        q = q.where(or_(fts_condition, ilike_fallback))

        # Total count (without pagination), computed over the filtered query.
        count_q = select(func.count()).select_from(q.subquery())
        total: int = (await self.db.execute(count_q)).scalar_one()

        # Relevance: ts_rank for indexed rows, 0.0 for fallback rows so
        # they sort after every ranked match.
        rank_expr = case(
            (Message.search_vector.isnot(None), func.ts_rank(Message.search_vector, tsquery)),
            else_=0.0,
        )

        q = (
            q.options(selectinload(Message.labels))
            .order_by(
                rank_expr.desc(),
                Message.received_at.desc().nullslast(),
                Message.created_at.desc(),
            )
            .offset((page - 1) * page_size)
            .limit(page_size)
        )

        result = await self.db.execute(q)
        items = list(result.scalars().all())
        return items, total
|
||||
Reference in New Issue
Block a user