Semantic search

2026-06-16 12:45:42 +02:00 · 2026-03-25 18:39:50 +01:00
parent f5fb537fed
commit cbeedc2d2f
14 changed files with 1336 additions and 56 deletions
@@ -35,6 +35,7 @@ from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession

 from app.config import get_settings
+from app.jobs.index_message import index_message
 from app.models import Attachment, Mailbox, Message
 from app.parsers.eml_parser import parse_eml
 from app.parsers.pec_parser import apply_outbound_transition, classify_pec_message
@@ -635,6 +636,12 @@ async def _save_message(
        f"direction={direction!r} pec_type={pec_class.pec_type!r} "
        f"subject={message.subject!r} allegati={len(parsed.attachments)}"
    )
+
+    # ── Indicizzazione full-text (non bloccante, non interrompe la sync) ─────
+    # Chiamata dopo il flush degli allegati: index_message puo' leggere
+    # sia il messaggio che gli allegati dalla sessione corrente.
+    await index_message(message.id, db)
+
    return True


@@ -0,0 +1,218 @@
+"""
+Full-text indexing of PEC messages.
+
+Responsibilities:
+  1. Download PDF and DOCX attachments from MinIO
+  2. Extract their text with pypdf (PDF) and python-docx (DOCX)
+  3. Update the extracted_text column in attachments
+  4. Update the search_vector column in messages, including the attachment text
+
+Called at the end of _save_message in sync.py as a best-effort step: the call
+is awaited inline, but an exception raised here is logged instead of being
+propagated, so it does not interrupt the synchronization of the message.
+"""
+
+import io
+import logging
+import uuid
+
+from sqlalchemy import select, text
+from sqlalchemy.ext.asyncio import AsyncSession
+
+logger = logging.getLogger(__name__)
+
+# Maximum length of the text extracted from a single attachment (characters)
+MAX_EXTRACTED_TEXT_LEN = 50_000
+# Maximum length of the combined attachment text used for the search_vector (characters)
+MAX_COMBINED_TEXT_LEN = 200_000
+
+
+# ─── Estrazione testo ─────────────────────────────────────────────────────────
+
+def _extract_pdf_text(content: bytes) -> str:
+    """Estrae testo da un PDF usando pypdf."""
+    try:
+        import pypdf  # type: ignore[import]
+
+        reader = pypdf.PdfReader(io.BytesIO(content))
+        parts: list[str] = []
+        for page in reader.pages:
+            try:
+                txt = page.extract_text()
+                if txt:
+                    parts.append(txt)
+            except Exception:
+                continue
+        return " ".join(parts)
+    except ImportError:
+        logger.warning("pypdf non installato: impossibile estrarre testo da PDF")
+        return ""
+    except Exception as e:
+        logger.debug(f"Errore estrazione testo PDF: {e}")
+        return ""
+
+
+def _extract_docx_text(content: bytes) -> str:
+    """Estrae testo da un DOCX usando python-docx."""
+    try:
+        import docx  # type: ignore[import]
+
+        doc = docx.Document(io.BytesIO(content))
+        parts = [para.text for para in doc.paragraphs if para.text and para.text.strip()]
+        return " ".join(parts)
+    except ImportError:
+        logger.warning("python-docx non installato: impossibile estrarre testo da DOCX")
+        return ""
+    except Exception as e:
+        logger.debug(f"Errore estrazione testo DOCX: {e}")
+        return ""
+
+
+def _is_pdf(content_type: str | None, filename: str | None) -> bool:
+    ct = (content_type or "").lower()
+    fn = (filename or "").lower()
+    return ct == "application/pdf" or fn.endswith(".pdf")
+
+
+def _is_docx(content_type: str | None, filename: str | None) -> bool:
+    ct = (content_type or "").lower()
+    fn = (filename or "").lower()
+    return ct in (
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "application/msword",
+        "application/vnd.ms-word",
+    ) or fn.endswith((".docx", ".doc"))
+
+
+# ─── Job principale ───────────────────────────────────────────────────────────
+
+async def index_message(
+    message_id: uuid.UUID,
+    db: AsyncSession,
+) -> None:
+    """
+    Indicizza un messaggio per la ricerca full-text.
+
+    Non solleva eccezioni: tutti gli errori vengono loggati ma non propagati,
+    per non interrompere il flusso di sincronizzazione.
+    """
+    try:
+        await _do_index_message(message_id, db)
+    except Exception as e:
+        logger.error(
+            f"Errore indicizzazione messaggio {message_id}: {e}",
+            exc_info=True,
+        )
+
+
+async def _do_index_message(
+    message_id: uuid.UUID,
+    db: AsyncSession,
+) -> None:
+    """Logica interna di indicizzazione (puo' sollevare eccezioni)."""
+    from app.config import get_settings
+    from app.models import Attachment, Message
+
+    settings = get_settings()
+
+    # ── Carica il messaggio ───────────────────────────────────────────────────
+    msg_result = await db.execute(
+        select(Message).where(Message.id == message_id)
+    )
+    message = msg_result.scalar_one_or_none()
+    if not message:
+        logger.warning(f"index_message: messaggio {message_id} non trovato in DB")
+        return
+
+    # ── Carica gli allegati ───────────────────────────────────────────────────
+    att_result = await db.execute(
+        select(Attachment).where(Attachment.message_id == message_id)
+    )
+    attachments = list(att_result.scalars().all())
+
+    if not attachments:
+        logger.debug(f"Messaggio {message_id}: nessun allegato, skip indicizzazione allegati")
+        return
+
+    # ── Crea client MinIO ─────────────────────────────────────────────────────
+    try:
+        from miniopy_async import Minio  # type: ignore[import]
+
+        minio = Minio(
+            endpoint=settings.minio_endpoint,
+            access_key=settings.minio_access_key,
+            secret_key=settings.minio_secret_key,
+            secure=settings.minio_use_ssl,
+        )
+    except Exception as e:
+        logger.warning(f"Impossibile creare client MinIO per indicizzazione {message_id}: {e}")
+        return
+
+    bucket = settings.minio_bucket
+    attachment_texts: list[str] = []
+    indexed_count = 0
+
+    for att in attachments:
+        # Se gia' indicizzato, usa il testo cached
+        if att.extracted_text is not None:
+            attachment_texts.append(att.extracted_text)
+            continue
+
+        # Controlla se e' un PDF o DOCX
+        if not (_is_pdf(att.content_type, att.filename) or _is_docx(att.content_type, att.filename)):
+            continue
+
+        # Scarica da MinIO
+        try:
+            response = await minio.get_object(bucket, att.storage_path)
+            content = await response.content.read()
+            response.close()
+        except Exception as e:
+            logger.warning(
+                f"Impossibile scaricare allegato {att.id} "
+                f"({att.filename!r}) da MinIO: {e}"
+            )
+            continue
+
+        # Estrai testo
+        if _is_pdf(att.content_type, att.filename):
+            extracted = _extract_pdf_text(content)
+        else:
+            extracted = _extract_docx_text(content)
+
+        if not extracted or not extracted.strip():
+            continue
+
+        # Limita la dimensione e salva
+        att.extracted_text = extracted[:MAX_EXTRACTED_TEXT_LEN]
+        attachment_texts.append(att.extracted_text)
+        indexed_count += 1
+
+    # ── Aggiorna search_vector includendo il testo degli allegati ─────────────
+    if attachment_texts:
+        combined = " ".join(attachment_texts)[:MAX_COMBINED_TEXT_LEN]
+
+        await db.execute(
+            text("""
+                UPDATE messages
+                SET search_vector =
+                    setweight(to_tsvector('italian', coalesce(subject, '')), 'A') ||
+                    setweight(to_tsvector('simple',  coalesce(from_address, '')), 'B') ||
+                    setweight(to_tsvector('simple',
+                        coalesce(array_to_string(to_addresses, ' '), '')), 'B') ||
+                    setweight(to_tsvector('italian', coalesce(body_text, '')), 'C') ||
+                    setweight(to_tsvector('italian', :att_text), 'D')
+                WHERE id = :message_id
+            """),
+            {"att_text": combined, "message_id": str(message_id)},
+        )
+
+        await db.flush()
+
+        logger.info(
+            f"Indicizzazione completata: messaggio {message_id}, "
+            f"{indexed_count} allegati indicizzati su {len(attachments)} totali"
+        )
+    else:
+        logger.debug(
+            f"Messaggio {message_id}: nessun allegato PDF/DOCX con testo estraibile"
+        )