Semantic search

This commit is contained in:
2026-03-25 18:39:50 +01:00
parent f5fb537fed
commit cbeedc2d2f
14 changed files with 1336 additions and 56 deletions
+41 -11
View File
@@ -26,6 +26,8 @@ from sqlalchemy import func, or_, select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload
from app.services.search_service import SearchService
from app.config import get_settings
from app.core.exceptions import ForbiddenError, NotFoundError
from app.database import get_db
@@ -184,8 +186,11 @@ async def list_messages(
is_starred: Optional[bool] = Query(None),
is_archived: Optional[bool] = Query(False),
is_trashed: Optional[bool] = Query(False),
search: Optional[str] = Query(None, max_length=200),
search: Optional[str] = Query(None, max_length=500),
pec_type: Optional[str] = Query(None),
# Filtri data (ISO 8601, es. 2026-01-01T00:00:00Z)
date_from: Optional[datetime] = Query(None, description="Data minima (received_at o sent_at)"),
date_to: Optional[datetime] = Query(None, description="Data massima (received_at o sent_at)"),
# Paginazione
page: int = Query(1, ge=1),
page_size: int = Query(50, ge=1, le=200),
@@ -195,7 +200,8 @@ async def list_messages(
- `is_archived=False` (default) esclude i messaggi archiviati.
- `is_trashed=False` (default) esclude i messaggi nel cestino.
- `search` cerca su subject, from_address, to_addresses.
- `search` usa ricerca full-text (tsvector) con fallback ILIKE.
- `date_from` / `date_to` filtrano per data ricezione o invio.
- `vbox_id` filtra per Virtual Box assegnata all'utente corrente.
"""
# Determinare le caselle visibili (normale check permessi)
@@ -284,16 +290,30 @@ async def list_messages(
if is_trashed is not None:
q = q.where(Message.is_trashed == is_trashed)
# ── Full-text search (FTS con fallback ILIKE per messaggi non indicizzati) ───
if search:
term = f"%{search}%"
from sqlalchemy import case as sa_case
tsquery = func.websearch_to_tsquery("italian", search)
term_like = f"%{search}%"
q = q.where(
or_(
Message.subject.ilike(term),
Message.from_address.ilike(term),
Message.body_text.ilike(term),
Message.search_vector.op("@@")(tsquery),
# Fallback per messaggi non ancora indicizzati dal worker
Message.search_vector.is_(None) & or_(
Message.subject.ilike(term_like),
Message.from_address.ilike(term_like),
Message.body_text.ilike(term_like),
),
)
)
# ── Filtri data ───────────────────────────────────────────────────────────
if date_from:
q = q.where(or_(Message.received_at >= date_from, Message.sent_at >= date_from))
if date_to:
q = q.where(or_(Message.received_at <= date_to, Message.sent_at <= date_to))
# Applica le regole della Virtual Box (AND tra le regole)
for rule in vbox_rules:
q = _apply_vbox_rule(q, rule.field, rule.operator, rule.value)
@@ -302,13 +322,23 @@ async def list_messages(
count_q = select(func.count()).select_from(q.subquery())
total = (await db.execute(count_q)).scalar_one()
# Ordinamento e paginazione
# Ordinamento: se c'e' una ricerca, ordina per rilevanza FTS, poi data
if search:
from sqlalchemy import case as sa_case
tsquery_ord = func.websearch_to_tsquery("italian", search)
rank_expr = sa_case(
(Message.search_vector.isnot(None), func.ts_rank(Message.search_vector, tsquery_ord)),
else_=0.0,
)
order_clauses = [rank_expr.desc(), Message.received_at.desc().nullslast(), Message.created_at.desc()]
else:
order_clauses = [Message.received_at.desc().nullslast(), Message.created_at.desc()]
# Paginazione
q = (
q.options(selectinload(Message.labels))
.order_by(
Message.received_at.desc().nullslast(),
Message.created_at.desc(),
)
.order_by(*order_clauses)
.offset((page - 1) * page_size)
.limit(page_size)
)
+8 -1
View File
@@ -4,6 +4,7 @@ Modelli Message, Attachment, SendJob.
import uuid
from datetime import datetime
from typing import Any
from sqlalchemy import (
ARRAY,
@@ -18,7 +19,7 @@ from sqlalchemy import (
Text,
func,
)
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.dialects.postgresql import TSVECTOR, UUID
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.database import Base
@@ -96,6 +97,9 @@ class Message(Base):
raw_eml_path: Mapped[str | None] = mapped_column(Text, nullable=True)
# Full-text search vector (aggiornato da trigger DB + worker per allegati)
search_vector: Mapped[Any | None] = mapped_column(TSVECTOR(), nullable=True)
created_at: Mapped[datetime] = mapped_column(
DateTime(timezone=True), nullable=False, server_default=func.now()
)
@@ -126,6 +130,7 @@ class Message(Base):
postgresql_where="parent_message_id IS NOT NULL",
),
Index("idx_messages_imap_uid", "mailbox_id", "imap_uid"),
Index("idx_messages_fts", "search_vector", postgresql_using="gin"),
)
def __repr__(self) -> str:
@@ -149,6 +154,8 @@ class Attachment(Base):
size_bytes: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
storage_path: Mapped[str] = mapped_column(Text, nullable=False)
checksum_sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)
# Testo estratto dal worker (solo PDF e DOCX) per la ricerca full-text
extracted_text: Mapped[str | None] = mapped_column(Text, nullable=True)
created_at: Mapped[datetime] = mapped_column(
DateTime(timezone=True), nullable=False, server_default=func.now()
)
+139
View File
@@ -0,0 +1,139 @@
"""
Servizio di ricerca full-text per i messaggi PEC.
Utilizza i vettori tsvector di PostgreSQL per ricerche veloci su:
- oggetto (peso A)
- mittente / destinatari (peso B)
- corpo del messaggio (peso C)
- testo estratto dagli allegati PDF/DOCX (peso D)
Se search_vector e' NULL (messaggio non ancora indicizzato dal worker),
si ricade automaticamente su ILIKE sulle colonne base.
"""
import uuid
from datetime import datetime
from typing import Optional
from sqlalchemy import case, func, or_, select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload
from app.models.message import Message
class SearchService:
    """Full-text search over messages.

    Messages whose ``search_vector`` is populated are matched with the
    ``@@`` operator against a ``websearch_to_tsquery`` query; messages whose
    ``search_vector`` is still NULL are matched with a literal,
    case-insensitive substring search (ILIKE) on subject, sender address and
    body text.
    """

    def __init__(self, db: AsyncSession) -> None:
        """Store the async session used to execute the search queries."""
        self.db = db

    @staticmethod
    def _escape_like(term: str) -> str:
        """Return *term* with LIKE metacharacters escaped by a backslash.

        The backslash itself is escaped first so that the escapes added for
        ``%`` and ``_`` are not doubled afterwards.
        """
        return (
            term.replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_")
        )

    @staticmethod
    def _date_window(
        column,
        date_from: Optional[datetime],
        date_to: Optional[datetime],
    ):
        """Build an inclusive date condition for a single timestamp column.

        At least one of the two bounds must be set; the caller guarantees it.
        """
        if date_from is not None and date_to is not None:
            # BETWEEN is inclusive on both ends, same as ">= AND <=".
            return column.between(date_from, date_to)
        if date_from is not None:
            return column >= date_from
        return column <= date_to

    async def search_messages(
        self,
        tenant_id: uuid.UUID,
        search_term: str,
        visible_mailbox_ids: Optional[list[uuid.UUID]],
        mailbox_id: Optional[uuid.UUID] = None,
        direction: Optional[str] = None,
        state: Optional[str] = None,
        pec_type: Optional[str] = None,
        date_from: Optional[datetime] = None,
        date_to: Optional[datetime] = None,
        is_archived: Optional[bool] = False,
        is_trashed: Optional[bool] = False,
        is_starred: Optional[bool] = None,
        is_read: Optional[bool] = None,
        page: int = 1,
        page_size: int = 50,
    ) -> tuple[list[Message], int]:
        """Search top-level messages of a tenant and return one page of hits.

        Args:
            tenant_id: Only messages of this tenant are considered.
            search_term: User-supplied search text. It is parsed with
                ``websearch_to_tsquery("italian", ...)`` for indexed rows and
                matched as a literal substring for non-indexed rows.
            visible_mailbox_ids: Mailboxes the caller may see. ``None``
                applies no restriction; an empty list short-circuits to an
                empty result.
            mailbox_id: Optional exact mailbox filter.
            direction: Optional exact filter on ``Message.direction``.
            state: Optional exact filter on ``Message.state``.
            pec_type: Optional exact filter on ``Message.pec_type``.
            date_from: Inclusive lower bound for ``received_at`` or
                ``sent_at``.
            date_to: Inclusive upper bound for ``received_at`` or
                ``sent_at``.
            is_archived: Filter on the archived flag; ``None`` disables it.
            is_trashed: Filter on the trashed flag; ``None`` disables it.
            is_starred: Filter on the starred flag; ``None`` disables it.
            is_read: Filter on the read flag; ``None`` disables it.
            page: 1-based page number.
            page_size: Number of rows per page.

        Returns:
            A tuple ``(items, total)``: the messages of the requested page
            (with ``labels`` eagerly loaded) and the total number of matches
            before pagination.
        """
        # Only top-level messages (no parent) of the requested tenant.
        q = select(Message).where(
            Message.tenant_id == tenant_id,
            Message.parent_message_id.is_(None),
        )

        # Permission restriction: an explicit empty list means "nothing is
        # visible", so no query needs to be run at all.
        if visible_mailbox_ids is not None:
            if not visible_mailbox_ids:
                return [], 0
            q = q.where(Message.mailbox_id.in_(visible_mailbox_ids))

        # Optional exact-match filters.
        if mailbox_id:
            q = q.where(Message.mailbox_id == mailbox_id)
        if direction:
            q = q.where(Message.direction == direction)
        if state:
            q = q.where(Message.state == state)
        if pec_type:
            q = q.where(Message.pec_type == pec_type)

        # Tri-state boolean flags: None means "do not filter".
        if is_archived is not None:
            q = q.where(Message.is_archived == is_archived)
        if is_trashed is not None:
            q = q.where(Message.is_trashed == is_trashed)
        if is_starred is not None:
            q = q.where(Message.is_starred == is_starred)
        if is_read is not None:
            q = q.where(Message.is_read == is_read)

        # Date window: a message matches when received_at OR sent_at lies in
        # the window. Both bounds are applied to the SAME column; bounding
        # the two columns independently would let through a message whose
        # received_at satisfies only date_from while its sent_at satisfies
        # only date_to.
        if date_from is not None or date_to is not None:
            q = q.where(
                or_(
                    self._date_window(Message.received_at, date_from, date_to),
                    self._date_window(Message.sent_at, date_from, date_to),
                )
            )

        # Full-text match for indexed rows, ILIKE fallback for rows whose
        # search_vector is still NULL.
        tsquery = func.websearch_to_tsquery("italian", search_term)
        # Escape % and _ so the user's text is matched literally instead of
        # being interpreted as LIKE wildcards.
        term_like = f"%{self._escape_like(search_term)}%"

        fts_condition = Message.search_vector.op("@@")(tsquery)
        ilike_fallback = Message.search_vector.is_(None) & or_(
            Message.subject.ilike(term_like, escape="\\"),
            Message.from_address.ilike(term_like, escape="\\"),
            Message.body_text.ilike(term_like, escape="\\"),
        )
        q = q.where(or_(fts_condition, ilike_fallback))

        # Total number of matches, computed before ordering and pagination.
        count_q = select(func.count()).select_from(q.subquery())
        total: int = (await self.db.execute(count_q)).scalar_one()

        # Relevance: ts_rank for indexed rows, 0.0 for fallback rows so they
        # sort after every ranked hit.
        rank_expr = case(
            (
                Message.search_vector.isnot(None),
                func.ts_rank(Message.search_vector, tsquery),
            ),
            else_=0.0,
        )

        q = (
            q.options(selectinload(Message.labels))
            .order_by(
                rank_expr.desc(),
                Message.received_at.desc().nullslast(),
                Message.created_at.desc(),
            )
            .offset((page - 1) * page_size)
            .limit(page_size)
        )

        result = await self.db.execute(q)
        items = list(result.scalars().all())
        return items, total