OCR + reportistica

2026-06-16 12:45:42 +02:00 · 2026-03-27 13:54:07 +01:00
parent cbeedc2d2f
commit bb2060c1ae
26 changed files with 5503 additions and 237 deletions
@@ -5,6 +5,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    libpq-dev \
    curl \
+    tesseract-ocr \
+    tesseract-ocr-ita \
+    tesseract-ocr-eng \
+    poppler-utils \
    && rm -rf /var/lib/apt/lists/*

 WORKDIR /worker
@@ -540,11 +540,15 @@ async def _save_message(
        logger.debug(f"[{mailbox.email_address}] UID {uid} in {imap_folder!r} già in DB, skip")
        return False

-    # ── Parsing completo EML ──────────────────────────────────────────────────
-    parsed = parse_eml(raw_eml)
-    pec_class = classify_pec_message(
-        parsed.raw_message or email.message_from_bytes(raw_eml)
-    )
+    # ── Classificazione PEC da header (veloce, senza body) ───────────────────
+    # La classificazione avviene PRIMA del parsing completo perche' il parser
+    # deve sapere se il messaggio e' una ricevuta per evitare di sovrascrivere
+    # il body_text (testo della ricevuta) con il contenuto di postacert.eml.
+    quick_msg = email.message_from_bytes(raw_eml)
+    pec_class = classify_pec_message(quick_msg)
+
+    # ── Parsing completo EML (con is_receipt per proteggere il body) ──────────
+    parsed = parse_eml(raw_eml, is_receipt=pec_class.is_receipt)
    received_at = datetime.now(UTC)

    # ── State machine: trova e aggiorna messaggio outbound ────────────────────
@@ -2,17 +2,29 @@
 Indicizzazione full-text dei messaggi PEC.

 Responsabilita':
-  1. Scarica gli allegati PDF e DOCX da MinIO
-  2. Estrae il testo con pypdf (PDF) e python-docx (DOCX)
+  1. Scarica gli allegati da MinIO
+  2. Estrae il testo in base al formato del file
  3. Aggiorna la colonna extracted_text in attachments
  4. Aggiorna la colonna search_vector in messages includendo il testo degli allegati

+Formati supportati:
+  - PDF       (.pdf)           tramite pypdf
+  - Word      (.docx, .doc)    tramite python-docx
+  - Excel     (.xlsx, .xls)    tramite openpyxl
+  - PowerPoint(.pptx, .ppt)    tramite python-pptx
+  - LibreOffice (.odt, .ods, .odp) tramite odfpy
+  - RTF       (.rtf)           tramite striprtf
+  - Testo     (.txt, .csv, .xml, .html, .htm) testo grezzo
+  - Email     (.eml, .msg)     tramite stdlib email
+  - Firmati   (.p7m)           unwrap CMS poi estrae in base all'estensione interna
+
 Viene chiamato alla fine di _save_message in sync.py, in modo non bloccante:
 un'eccezione qui non interrompe la sincronizzazione del messaggio.
 """

 import io
 import logging
+import re
 import uuid

 from sqlalchemy import select, text
@@ -26,61 +38,534 @@ MAX_EXTRACTED_TEXT_LEN = 50_000
 MAX_COMBINED_TEXT_LEN = 200_000


-# ─── Estrazione testo ─────────────────────────────────────────────────────────
+# ─── Rilevamento tipo file ────────────────────────────────────────────────────

-def _extract_pdf_text(content: bytes) -> str:
-    """Estrae testo da un PDF usando pypdf."""
+def _ext(filename: str | None) -> str:
+    """
+    Return the lower-cased extension of *filename*, without the leading dot.
+
+    A missing/empty name, or a name without any dot, yields "". Names ending in
+    ".p7m" (including double extensions such as "document.pdf.p7m") yield "p7m".
+    """
+    lowered = (filename or "").lower()
+    # Signed envelopes are recognised first, whatever the inner extension is.
+    if lowered.endswith(".p7m"):
+        return "p7m"
+    _stem, dot, suffix = lowered.rpartition(".")
+    return suffix if dot else ""
+
+
+def _is_extractable(content_type: str | None, filename: str | None) -> bool:
+    """
+    Tell whether an attachment is in a format one of the extractors handles.
+
+    The attachment qualifies when its filename extension is a key of
+    _EXTRACTORS or when its lower-cased content type is a key of
+    _CONTENT_TYPE_MAP.
+    """
+    if _ext(filename) in _EXTRACTORS:
+        return True
+    normalized_type = (content_type or "").lower()
+    return normalized_type in _CONTENT_TYPE_MAP
+
+
+def _resolve_extractor(content_type: str | None, filename: str | None):
+    """
+    Pick the extractor function for an attachment, or None when unsupported.
+
+    The filename extension takes precedence; the lower-cased content type is
+    consulted through _CONTENT_TYPE_MAP only when the extension is unknown.
+    """
+    extension = _ext(filename)
+    if extension not in _EXTRACTORS:
+        # Fall back to the declared MIME type (e.g. when the filename is missing).
+        mapped = _CONTENT_TYPE_MAP.get((content_type or "").lower())
+        return None if mapped is None else _EXTRACTORS.get(mapped)
+    return _EXTRACTORS[extension]
+
+
+# ─── Estrattori ───────────────────────────────────────────────────────────────
+
+# Minimum number of characters pypdf must extract from a PDF; below this the
+# OCR fallback is attempted. A real text PDF yields thousands of characters,
+# while a scan yields none or very few (artifacts). 50 characters is a
+# conservative, safe value.
+_PDF_OCR_THRESHOLD = 50
+
+# Maximum number of pages to run OCR on, to avoid timeouts on long PDFs.
+_PDF_OCR_MAX_PAGES = 15
+
+
+def _extract_pdf(content: bytes) -> str:
+    """
+    Estrae testo da PDF tramite pypdf.
+
+    Se il testo estratto e' inferiore a _PDF_OCR_THRESHOLD caratteri (PDF
+    image-only / scansione), attiva il fallback OCR via Tesseract.
+    """
    try:
        import pypdf  # type: ignore[import]
-
        reader = pypdf.PdfReader(io.BytesIO(content))
        parts: list[str] = []
        for page in reader.pages:
            try:
-                txt = page.extract_text()
-                if txt:
-                    parts.append(txt)
+                t = page.extract_text()
+                if t:
+                    parts.append(t)
            except Exception:
                continue
-        return " ".join(parts)
+        text = " ".join(parts)
    except ImportError:
        logger.warning("pypdf non installato: impossibile estrarre testo da PDF")
        return ""
    except Exception as e:
-        logger.debug(f"Errore estrazione testo PDF: {e}")
+        logger.debug(f"Errore estrazione PDF: {e}")
+        return ""
+
+    # Se il testo e' troppo corto, il PDF e' probabilmente una scansione
+    if len(text.strip()) < _PDF_OCR_THRESHOLD:
+        logger.debug(
+            f"PDF con testo insufficiente ({len(text.strip())} char), "
+            "tentativo OCR..."
+        )
+        ocr_text = _extract_pdf_ocr(content)
+        if len(ocr_text.strip()) > len(text.strip()):
+            return ocr_text
+
+    return text
+
+
+def _extract_pdf_ocr(content: bytes) -> str:
+    """
+    OCR an image-only PDF through pdf2image + Tesseract.
+
+    The PDF pages are rendered to images at 200 DPI and each image is passed to
+    Tesseract with the Italian + English language packs. At most
+    _PDF_OCR_MAX_PAGES pages are rendered, to avoid timeouts on long documents.
+
+    Args:
+        content: raw bytes of the PDF file.
+
+    Returns:
+        The stripped text of every page that produced text, joined by single
+        spaces; "" when the OCR libraries are missing or rendering fails.
+    """
+    try:
+        from pdf2image import convert_from_bytes  # type: ignore[import]
+        import pytesseract  # type: ignore[import]
+
+        pages = convert_from_bytes(
+            content,
+            dpi=200,
+            last_page=_PDF_OCR_MAX_PAGES,
+        )
+        parts: list[str] = []
+        for page_no, page_img in enumerate(pages, start=1):
+            try:
+                t = pytesseract.image_to_string(page_img, lang="ita+eng")
+            except Exception as page_err:
+                # Best effort: one bad page must not discard the others, but
+                # the failure is logged so that a systematic problem (e.g. a
+                # missing Tesseract binary or language pack) is diagnosable.
+                logger.debug(f"Errore OCR PDF pagina {page_no}: {page_err}")
+                continue
+            if t and t.strip():
+                parts.append(t.strip())
+        return " ".join(parts)
+    except ImportError:
+        logger.warning(
+            "pdf2image o pytesseract non installati: impossibile OCR PDF"
+        )
+        return ""
+    except Exception as e:
+        logger.debug(f"Errore OCR PDF: {e}")
         return ""


-def _extract_docx_text(content: bytes) -> str:
-    """Estrae testo da un DOCX usando python-docx."""
+def _extract_image_ocr(content: bytes) -> str:
+    """
+    OCR the text contained in an image file.
+
+    The bytes are opened with Pillow; any image whose mode is neither "RGB"
+    nor "L" is converted to RGB before being handed to Tesseract (Italian +
+    English). Whitespace in the recognised text is collapsed to single spaces.
+
+    Returns "" when pytesseract/Pillow are not installed or on any OCR error.
+    """
+    try:
+        import pytesseract  # type: ignore[import]
+        from PIL import Image  # type: ignore[import]
+
+        picture = Image.open(io.BytesIO(content))
+        needs_conversion = picture.mode != "RGB" and picture.mode != "L"
+        if needs_conversion:
+            # Palette, CMYK, alpha, ... -> plain RGB before running OCR
+            picture = picture.convert("RGB")
+        recognised = pytesseract.image_to_string(picture, lang="ita+eng")
+        words = recognised.split()
+        return " ".join(words)
+    except ImportError:
+        logger.warning(
+            "pytesseract o Pillow non installati: impossibile OCR immagine"
+        )
+        return ""
+    except Exception as e:
+        logger.debug(f"Errore OCR immagine: {e}")
+        return ""
+
+
+def _extract_docx(content: bytes) -> str:
+    """Estrae testo da DOCX/DOC tramite python-docx."""
    try:
        import docx  # type: ignore[import]
-
        doc = docx.Document(io.BytesIO(content))
-        parts = [para.text for para in doc.paragraphs if para.text and para.text.strip()]
+        parts = [p.text for p in doc.paragraphs if p.text and p.text.strip()]
+        # Include anche le tabelle
+        for table in doc.tables:
+            for row in table.rows:
+                for cell in row.cells:
+                    if cell.text and cell.text.strip():
+                        parts.append(cell.text.strip())
        return " ".join(parts)
    except ImportError:
        logger.warning("python-docx non installato: impossibile estrarre testo da DOCX")
        return ""
    except Exception as e:
-        logger.debug(f"Errore estrazione testo DOCX: {e}")
+        logger.debug(f"Errore estrazione DOCX: {e}")
        return ""


-def _is_pdf(content_type: str | None, filename: str | None) -> bool:
-    ct = (content_type or "").lower()
-    fn = (filename or "").lower()
-    return ct == "application/pdf" or fn.endswith(".pdf")
+def _extract_xlsx(content: bytes) -> str:
+    """
+    Extract text from an XLSX/XLS workbook with openpyxl.
+
+    Every non-empty cell of every worksheet is converted to a string and the
+    values are joined with single spaces. The workbook is opened read-only
+    with data_only=True, so cached values are read instead of formulas.
+
+    Args:
+        content: raw bytes of the spreadsheet file.
+
+    Returns:
+        The concatenated cell text, or "" when openpyxl is not installed or
+        the file cannot be read.
+    """
+    try:
+        import openpyxl  # type: ignore[import]
+        wb = openpyxl.load_workbook(io.BytesIO(content), read_only=True, data_only=True)
+        try:
+            parts: list[str] = []
+            for ws in wb.worksheets:
+                for row in ws.iter_rows():
+                    for cell in row:
+                        if cell.value is not None:
+                            v = str(cell.value).strip()
+                            if v:
+                                parts.append(v)
+        finally:
+            # A read-only workbook keeps its archive open until closed
+            # explicitly; release it even when a sheet fails to iterate.
+            wb.close()
+        return " ".join(parts)
+    except ImportError:
+        logger.warning("openpyxl non installato: impossibile estrarre testo da XLSX")
+        return ""
+    except Exception as e:
+        logger.debug(f"Errore estrazione XLSX: {e}")
+        return ""


-def _is_docx(content_type: str | None, filename: str | None) -> bool:
-    ct = (content_type or "").lower()
-    fn = (filename or "").lower()
-    return ct in (
-        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        "application/msword",
-        "application/vnd.ms-word",
-    ) or fn.endswith((".docx", ".doc"))
+def _extract_pptx(content: bytes) -> str:
+    """
+    Extract text from a PPTX/PPT presentation with python-pptx.
+
+    Walks every slide, keeps the shapes that expose a text frame and collects
+    the stripped, non-empty text of each paragraph, joined by single spaces.
+
+    Returns "" when python-pptx is not installed or the file cannot be read.
+    """
+    try:
+        from pptx import Presentation  # type: ignore[import]
+
+        deck = Presentation(io.BytesIO(content))
+        framed_shapes = (
+            shape
+            for slide in deck.slides
+            for shape in slide.shapes
+            if shape.has_text_frame
+        )
+        collected: list[str] = []
+        for shape in framed_shapes:
+            stripped = (para.text.strip() for para in shape.text_frame.paragraphs)
+            collected.extend(chunk for chunk in stripped if chunk)
+        return " ".join(collected)
+    except ImportError:
+        logger.warning("python-pptx non installato: impossibile estrarre testo da PPTX")
+        return ""
+    except Exception as e:
+        logger.debug(f"Errore estrazione PPTX: {e}")
+        return ""
+
+
+def _extract_odt(content: bytes) -> str:
+    """
+    Extract text from an ODT/ODS/ODP document with odfpy.
+
+    Collects the stripped, non-empty text of every text:p element found in the
+    document body and joins the pieces with single spaces.
+
+    Returns "" when odfpy is not installed or the file cannot be read.
+    """
+    try:
+        from odf import opendocument, teletype  # type: ignore[import]
+        from odf.text import P  # type: ignore[import]
+
+        document = opendocument.load(io.BytesIO(content))
+        paragraphs = document.body.getElementsByType(P)
+        stripped = (teletype.extractText(node).strip() for node in paragraphs)
+        return " ".join(piece for piece in stripped if piece)
+    except ImportError:
+        logger.warning("odfpy non installato: impossibile estrarre testo da ODT")
+        return ""
+    except Exception as e:
+        logger.debug(f"Errore estrazione ODT: {e}")
+        return ""
+
+
+def _extract_rtf(content: bytes) -> str:
+    """
+    Extract text from an RTF document with striprtf.
+
+    The bytes are decoded as latin-1 and handed to striprtf's rtf_to_text.
+    Returns "" when striprtf is not installed or the conversion fails.
+    """
+    try:
+        from striprtf.striprtf import rtf_to_text  # type: ignore[import]
+
+        return rtf_to_text(content.decode("latin-1", errors="replace"))
+    except ImportError:
+        logger.warning("striprtf non installato: impossibile estrarre testo da RTF")
+        return ""
+    except Exception as e:
+        logger.debug(f"Errore estrazione RTF: {e}")
+        return ""
+
+
+def _extract_plain(content: bytes) -> str:
+    """
+    Extract text from a plain-text file (txt, csv, xml, html, ...).
+
+    The bytes are decoded as UTF-8, falling back to latin-1. When the text
+    contains both "<" and ">" it is treated as markup: tags are replaced by a
+    space and character references are decoded. Whitespace is then collapsed
+    to single spaces.
+
+    Args:
+        content: raw bytes of the file.
+
+    Returns:
+        The normalised text, or "" on an unexpected error.
+    """
+    import html
+
+    try:
+        # Try UTF-8 first, then latin-1 (which accepts any byte sequence)
+        try:
+            text = content.decode("utf-8")
+        except UnicodeDecodeError:
+            text = content.decode("latin-1")
+        # XML/HTML: drop the tags first, then decode the entities
+        if "<" in text and ">" in text:
+            text = re.sub(r"<[^>]+>", " ", text)
+            # Decode named and numeric references instead of blanking them:
+            # "perch&egrave;" must be indexed as "perchè", not as "perch".
+            text = html.unescape(text)
+        return " ".join(text.split())
+    except Exception as e:
+        logger.debug(f"Errore estrazione testo plain: {e}")
+        return ""
+
+
+def _extract_eml(content: bytes) -> str:
+    """
+    Extract searchable text from an attached EML file.
+
+    Collects the Subject and From headers (RFC 2047 encoded-words are decoded
+    so that non-ASCII subjects are indexed as readable text) followed by the
+    body: every text/plain part of a multipart message, or the whole payload
+    of a single-part message.
+
+    Args:
+        content: raw bytes of the message.
+
+    Returns:
+        Header values and body parts joined by single spaces, or "" on an
+        unexpected error.
+    """
+    try:
+        import email as emaillib
+        from email.header import decode_header, make_header
+
+        msg = emaillib.message_from_bytes(content)
+        parts: list[str] = []
+
+        for header_name in ("Subject", "From"):
+            raw_value = msg.get(header_name, "")
+            if not raw_value:
+                continue
+            try:
+                parts.append(str(make_header(decode_header(str(raw_value)))))
+            except Exception:
+                # Malformed encoded-word: index the raw header instead
+                parts.append(str(raw_value))
+
+        # Multipart: only the text/plain leaves; single part: the payload as is
+        if msg.is_multipart():
+            body_parts = [p for p in msg.walk() if p.get_content_type() == "text/plain"]
+        else:
+            body_parts = [msg]
+
+        for part in body_parts:
+            try:
+                payload = part.get_payload(decode=True)
+            except Exception:
+                continue
+            if not payload:
+                continue
+            charset = part.get_content_charset() or "utf-8"
+            try:
+                parts.append(payload.decode(charset, errors="replace"))  # type: ignore[union-attr]
+            except LookupError:
+                # Unknown charset label: do not lose headers and body for it
+                parts.append(payload.decode("utf-8", errors="replace"))  # type: ignore[union-attr]
+        return " ".join(parts)
+    except Exception as e:
+        logger.debug(f"Errore estrazione EML: {e}")
+        return ""
+
+
+def _extract_p7m(content: bytes, original_filename: str | None = None) -> str:
+    """
+    Extract text from a CAdES digitally signed document (.p7m).
+
+    The CMS envelope is unwrapped with _unwrap_p7m_asn1 (the signature is not
+    verified). If the raw bytes cannot be unwrapped, the content is retried as
+    base64 text (PEM armor lines are ignored). The format of the inner
+    document is taken from the original filename (e.g. "fattura.pdf.p7m" ->
+    "pdf"); when that does not identify an extractor, the format is guessed
+    from the first bytes of the payload, with plain text as the last resort.
+
+    Args:
+        content: raw bytes of the .p7m file.
+        original_filename: attachment name, used to find the inner extension.
+
+    Returns:
+        The text of the inner document, or "" when nothing could be unwrapped.
+    """
+    import base64
+
+    inner_content: bytes | None = None
+
+    # CMS SignedData layout walked by _unwrap_p7m_asn1:
+    # SEQUENCE {
+    #   OID (signedData)
+    #   [0] EXPLICIT SEQUENCE {
+    #     INTEGER (version)
+    #     SET (digestAlgorithms)
+    #     SEQUENCE (encapContentInfo) {
+    #       OID (contentType = data)
+    #       [0] EXPLICIT OCTET STRING (eContent) <- the original document
+    #     }
+    #     ...
+    #   }
+    # }
+    try:
+        inner_content = _unwrap_p7m_asn1(content)
+    except Exception as e:
+        logger.debug(f"Unwrap P7M ASN1 fallito: {e}")
+
+    if not inner_content:
+        # Some envelopes are stored base64-encoded, optionally with
+        # "-----BEGIN ...-----" armor lines: decode and retry.
+        try:
+            stripped_lines = (line.strip() for line in content.splitlines())
+            b64_body = b"".join(
+                line for line in stripped_lines
+                if line and not line.startswith(b"-----")
+            )
+            inner_content = _unwrap_p7m_asn1(
+                base64.b64decode(b64_body, validate=True)
+            )
+        except Exception as e:
+            logger.debug(f"Unwrap P7M base64 fallito: {e}")
+
+    if not inner_content:
+        logger.debug("Impossibile estrarre contenuto dal file .p7m")
+        return ""
+
+    # Inner extension from the original name, e.g. "fattura.pdf.p7m" -> "pdf"
+    inner_ext = ""
+    if original_filename:
+        inner_name = original_filename.lower()
+        if inner_name.endswith(".p7m"):
+            inner_name = inner_name[:-4]  # drop ".p7m"
+        idx = inner_name.rfind(".")
+        if idx >= 0:
+            inner_ext = inner_name[idx + 1:]
+
+    extractor = _EXTRACTORS.get(inner_ext)
+    if extractor:
+        logger.debug(f"P7M: estrazione interna come {inner_ext!r}")
+        return extractor(inner_content)
+
+    # Fallback: recognise the format from the payload's leading bytes
+    if inner_content[:4] == b"%PDF":
+        return _extract_pdf(inner_content)
+    if inner_content[:2] == b"PK":  # ZIP-based (docx, xlsx, pptx, odt)
+        # Try the most common formats first
+        for zip_extractor in (_extract_docx, _extract_xlsx, _extract_pptx, _extract_odt):
+            result = zip_extractor(inner_content)
+            if result.strip():
+                return result
+    # Last attempt: plain text
+    return _extract_plain(inner_content)
+
+
+def _unwrap_p7m_asn1(data: bytes) -> bytes | None:
+    """
+    Minimal ASN.1 parser that extracts eContent from a CMS SignedData blob.
+
+    The signature is NOT verified: this is only meant for text extraction.
+    Both DER and BER encodings are accepted: indefinite lengths (0x80) are
+    followed up to their end-of-contents marker, and an eContent encoded as a
+    constructed OCTET STRING (tag 0x24, payload split into chunks) is
+    reassembled.
+
+    Args:
+        data: raw bytes of the CMS ContentInfo structure.
+
+    Returns:
+        The encapsulated content, or None when the structure is not the
+        expected SignedData layout, has no eContent, or is truncated.
+    """
+    eoc = b"\x00\x00"  # end-of-contents marker closing an indefinite length
+
+    def read_header(offset: int) -> tuple[int, int | None, int]:
+        """Return (tag, length, content_offset); length is None if indefinite."""
+        tag = data[offset]
+        first = data[offset + 1]
+        offset += 2
+        if first == 0x80:
+            return tag, None, offset
+        if first & 0x80:
+            num_bytes = first & 0x7F
+            if offset + num_bytes > len(data):
+                raise IndexError("truncated ASN.1 length")
+            length = int.from_bytes(data[offset:offset + num_bytes], "big")
+            return tag, length, offset + num_bytes
+        return tag, first, offset
+
+    def skip_element(offset: int) -> int:
+        """Return the offset just past the element starting at *offset*."""
+        _tag, length, offset = read_header(offset)
+        if length is not None:
+            return offset + length
+        # Indefinite length: skip the children until the EOC marker
+        while data[offset:offset + 2] != eoc:
+            offset = skip_element(offset)
+        return offset + 2
+
+    def read_octets(offset: int, chunks: list[bytes]) -> int:
+        """Append the OCTET STRING payload at *offset* to *chunks*."""
+        tag, length, offset = read_header(offset)
+        if tag == 0x04:
+            if length is None:
+                raise ValueError("primitive OCTET STRING with indefinite length")
+            chunks.append(data[offset:offset + length])
+            return offset + length
+        if tag != 0x24:
+            raise ValueError("unexpected tag inside OCTET STRING")
+        # Constructed form: a run of nested OCTET STRING chunks
+        if length is None:
+            while data[offset:offset + 2] != eoc:
+                offset = read_octets(offset, chunks)
+            return offset + 2
+        end = offset + length
+        while offset < end:
+            offset = read_octets(offset, chunks)
+        return offset
+
+    # (expected tag, skip whole element?) down to the eContent wrapper.
+    # Containers are entered (skip=False); leaf fields are stepped over.
+    walk = (
+        (0x30, False),  # outer SEQUENCE (ContentInfo)
+        (0x06, True),   # OID (contentType = signedData)
+        (0xA0, False),  # [0] EXPLICIT
+        (0x30, False),  # SEQUENCE (SignedData)
+        (0x02, True),   # INTEGER (version)
+        (0x31, True),   # SET (digestAlgorithms)
+        (0x30, False),  # SEQUENCE (encapContentInfo)
+        (0x06, True),   # OID (eContentType)
+        (0xA0, False),  # [0] EXPLICIT (eContent, optional)
+    )
+
+    try:
+        pos = 0
+        for expected_tag, skip in walk:
+            tag, _length, body_start = read_header(pos)
+            if tag != expected_tag:
+                return None
+            pos = skip_element(pos) if skip else body_start
+
+        # OCTET STRING holding the original document
+        if data[pos] not in (0x04, 0x24):
+            return None
+        chunks: list[bytes] = []
+        read_octets(pos, chunks)
+        return b"".join(chunks)
+    except (IndexError, ValueError, RecursionError):
+        # Truncated or malformed structure
+        return None
+
+
+# ─── Mapping formato -> estrattore ────────────────────────────────────────────
+
+# Maps a lower-case file extension (without the dot) to the function that
+# extracts text from that format. Each extractor is called with the raw file
+# bytes.
+# NOTE(review): the legacy binary formats (doc/xls/ppt) and Outlook "msg" are
+# routed to the same extractors as docx/xlsx/pptx/eml — confirm those
+# libraries can actually read them, otherwise these entries only yield
+# empty or garbage text.
+_EXTRACTORS: dict[str, object] = {
+    # Office documents
+    "pdf":  _extract_pdf,
+    "docx": _extract_docx,
+    "doc":  _extract_docx,
+    "xlsx": _extract_xlsx,
+    "xls":  _extract_xlsx,
+    "pptx": _extract_pptx,
+    "ppt":  _extract_pptx,
+    # LibreOffice
+    "odt":  _extract_odt,
+    "ods":  _extract_odt,
+    "odp":  _extract_odt,
+    # Plain text
+    "txt":  _extract_plain,
+    "csv":  _extract_plain,
+    "xml":  _extract_plain,
+    "html": _extract_plain,
+    "htm":  _extract_plain,
+    "json": _extract_plain,
+    # RTF
+    "rtf":  _extract_rtf,
+    # Email
+    "eml":  _extract_eml,
+    "msg":  _extract_eml,
+    # CAdES digital signature
+    "p7m":  _extract_p7m,
+    # Images (OCR)
+    "png":  _extract_image_ocr,
+    "jpg":  _extract_image_ocr,
+    "jpeg": _extract_image_ocr,
+    "tiff": _extract_image_ocr,
+    "tif":  _extract_image_ocr,
+    "bmp":  _extract_image_ocr,
+    "gif":  _extract_image_ocr,
+    "webp": _extract_image_ocr,
+}
+
+# Maps a lower-case MIME content type to the normalised extension used as key
+# of _EXTRACTORS (fallback for attachments whose filename is missing or has an
+# unknown extension).
+_CONTENT_TYPE_MAP: dict[str, str] = {
+    "application/pdf": "pdf",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
+    "application/msword": "doc",
+    "application/vnd.ms-word": "doc",
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
+    "application/vnd.ms-excel": "xls",
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
+    "application/vnd.ms-powerpoint": "ppt",
+    "application/vnd.oasis.opendocument.text": "odt",
+    "application/vnd.oasis.opendocument.spreadsheet": "ods",
+    "application/vnd.oasis.opendocument.presentation": "odp",
+    "application/rtf": "rtf",
+    "text/rtf": "rtf",
+    "text/plain": "txt",
+    "text/csv": "csv",
+    "text/xml": "xml",
+    "application/xml": "xml",
+    "text/html": "html",
+    "message/rfc822": "eml",
+    "application/pkcs7-mime": "p7m",
+    "application/x-pkcs7-mime": "p7m",
+    # Images (OCR)
+    "image/png":  "png",
+    "image/jpeg": "jpeg",
+    "image/jpg":  "jpeg",
+    "image/tiff": "tiff",
+    "image/bmp":  "bmp",
+    "image/gif":  "gif",
+    "image/webp": "webp",
+}


 # ─── Job principale ───────────────────────────────────────────────────────────
@@ -157,8 +642,13 @@ async def _do_index_message(
            attachment_texts.append(att.extracted_text)
            continue

-        # Controlla se e' un PDF o DOCX
-        if not (_is_pdf(att.content_type, att.filename) or _is_docx(att.content_type, att.filename)):
+        # Controlla se il formato e' supportato
+        extractor = _resolve_extractor(att.content_type, att.filename)
+        if extractor is None:
+            logger.debug(
+                f"Formato non supportato per indicizzazione: "
+                f"{att.filename!r} ({att.content_type!r})"
+            )
            continue

        # Scarica da MinIO
@@ -173,19 +663,29 @@ async def _do_index_message(
            )
            continue

-        # Estrai testo
-        if _is_pdf(att.content_type, att.filename):
-            extracted = _extract_pdf_text(content)
-        else:
-            extracted = _extract_docx_text(content)
-
-        if not extracted or not extracted.strip():
+        # Estrai testo - per p7m passa anche il filename originale
+        try:
+            e = _ext(att.filename)
+            if e == "p7m":
+                extracted = _extract_p7m(content, att.filename)
+            else:
+                extracted = extractor(content)  # type: ignore[operator]
+        except Exception as ex:
+            logger.debug(f"Errore estrazione {att.filename!r}: {ex}")
            continue

-        # Limita la dimensione e salva
+        if not extracted or not extracted.strip():
+            logger.debug(f"Nessun testo estratto da {att.filename!r}")
+            continue
+
+        # Limita la dimensione e salva sull'ORM object (col. mappata)
        att.extracted_text = extracted[:MAX_EXTRACTED_TEXT_LEN]
        attachment_texts.append(att.extracted_text)
        indexed_count += 1
+        logger.debug(
+            f"Testo estratto da {att.filename!r}: "
+            f"{len(att.extracted_text)} caratteri"
+        )

    # ── Aggiorna search_vector includendo il testo degli allegati ─────────────
    if attachment_texts:
@@ -214,5 +714,5 @@ async def _do_index_message(
        )
    else:
        logger.debug(
-            f"Messaggio {message_id}: nessun allegato PDF/DOCX con testo estraibile"
+            f"Messaggio {message_id}: nessun allegato con testo estraibile"
        )
@@ -194,6 +194,8 @@ class Attachment(Base):
    size_bytes: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    storage_path: Mapped[str] = mapped_column(Text, nullable=False)
    checksum_sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)
+    # Testo estratto dall'indicizzatore full-text per la ricerca
+    extracted_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now()
    )
@@ -116,7 +116,7 @@ def parse_date(date_str: str | None) -> datetime | None:
 # ─── Parser principale ────────────────────────────────────────────────────────


-def parse_eml(raw_bytes: bytes) -> ParsedEmail:
+def parse_eml(raw_bytes: bytes, is_receipt: bool = False) -> ParsedEmail:
    """
    Parsing completo di un raw EML.

@@ -126,7 +126,11 @@ def parse_eml(raw_bytes: bytes) -> ParsedEmail:
    - Allegati: tutti i parti con filename, inclusi message/rfc822

    Args:
-        raw_bytes: byte del messaggio EML grezzo
+        raw_bytes:  byte del messaggio EML grezzo
+        is_receipt: True se il messaggio e' una ricevuta PEC (accettazione,
+                    avvenuta_consegna, ecc.). In questo caso il body_text/html
+                    esterno (testo della ricevuta) non viene sovrascritto con
+                    il contenuto del messaggio annidato in postacert.eml.

    Returns:
        ParsedEmail con tutti i campi estratti (fields None/[] se non presenti)
@@ -153,7 +157,7 @@ def parse_eml(raw_bytes: bytes) -> ParsedEmail:

    # ── Body e allegati ───────────────────────────────────────────────────────
    if msg.is_multipart():
-        _walk_parts(msg, result)
+        _walk_parts(msg, result, is_receipt=is_receipt)
    else:
        _extract_single_part_body(msg, result)

@@ -208,7 +212,7 @@ def _get_filename(part: email.message.Message) -> str | None:
    return None


-def _walk_parts(msg: email.message.Message, result: ParsedEmail) -> None:
+def _walk_parts(msg: email.message.Message, result: ParsedEmail, is_receipt: bool = False) -> None:
    """
    Naviga ricorsivamente tutti i part MIME del messaggio.

@@ -230,7 +234,7 @@ def _walk_parts(msg: email.message.Message, result: ParsedEmail) -> None:

        # ── EML-in-EML (message/rfc822) ───────────────────────────────────────
        if ct == "message/rfc822":
-            _extract_eml_in_eml(part, filename, result)
+            _extract_eml_in_eml(part, filename, result, is_receipt=is_receipt)
            continue

        # ── Allegato esplicito (Content-Disposition: attachment) ──────────────
@@ -292,12 +296,16 @@ def _extract_eml_in_eml(
    part: email.message.Message,
    filename: str | None,
    result: ParsedEmail,
+    is_receipt: bool = False,
 ) -> None:
    """
    Estrae il messaggio EML annidato in un part message/rfc822.

-    Per postacert.eml (busta PEC in arrivo): ricorre dentro per estrarre
+    Per postacert.eml in messaggi posta_certificata: ricorre dentro per estrarre
    gli allegati utente e il corpo del messaggio originale del mittente.
+
+    Per le ricevute (is_receipt=True): estrae solo gli allegati utente senza
+    sovrascrivere il body gia' impostato (che e' il testo della ricevuta stessa).
    """
    try:
        payload = part.get_payload()
@@ -305,7 +313,7 @@ def _extract_eml_in_eml(
        inner_bytes: bytes | None = None

        if isinstance(payload, list) and payload:
-            # Forma canonica: payload è lista di Message
+            # Forma canonica: payload e' lista di Message
            inner_msg = payload[0]
            if isinstance(inner_msg, email.message.Message):
                inner_bytes = inner_msg.as_bytes()
@@ -330,19 +338,22 @@ def _extract_eml_in_eml(
            )
            result.attachments.append(att)

-            # Per postacert.eml: ricorre dentro per trovare allegati utente e corpo originale
+            # Per postacert.eml: ricorre dentro per trovare allegati utente
            if is_system and eff_filename.lower() == "postacert.eml":
                inner_parsed = parse_eml(inner_bytes)
                # Allegati non-sistema del messaggio originale del mittente
                for inner_att in inner_parsed.attachments:
                    if not inner_att.is_pec_system:
                        result.attachments.append(inner_att)
-                # Corpo del messaggio originale (più utile del testo della busta PEC)
-                if inner_parsed.body_html:
-                    result.body_html = inner_parsed.body_html
-                    result.body_text = inner_parsed.body_text
-                elif inner_parsed.body_text:
-                    result.body_text = inner_parsed.body_text
+                # Sovrascrive il corpo SOLO per messaggi posta_certificata (non ricevute).
+                # Per le ricevute il body esterno e' gia' il testo corretto della ricevuta;
+                # postacert.eml contiene il messaggio originale inviato che non va mostrato.
+                if not is_receipt:
+                    if inner_parsed.body_html:
+                        result.body_html = inner_parsed.body_html
+                        result.body_text = inner_parsed.body_text
+                    elif inner_parsed.body_text:
+                        result.body_text = inner_parsed.body_text

    except Exception as exc:
        logger.warning(f"Errore estrazione EML-in-EML: {exc}")
@@ -42,9 +42,18 @@ dependencies = [
    "python-dotenv>=1.0.0",
    "email-validator>=2.2.0",

-    # Full-text search: estrazione testo da allegati PDF e DOCX
+    # Full-text search: estrazione testo da allegati
    "pypdf>=4.0.0",
    "python-docx>=1.1.0",
+    "openpyxl>=3.1.0",
+    "python-pptx>=1.0.0",
+    "odfpy>=1.4.1",
+    "striprtf>=0.0.26",
+
+    # OCR per allegati image-only (immagini dirette e PDF scansionati)
+    "pytesseract>=0.3.13",
+    "pdf2image>=1.17.0",
+    "Pillow>=11.0.0",
 ]

 [project.optional-dependencies]
@@ -0,0 +1,129 @@
+"""
+Script one-shot: corregge il body_text/body_html delle ricevute PEC gia' in DB.
+
+Problema: il parser EML sovrascriveva il body delle ricevute con il contenuto
+di postacert.eml (messaggio originale inviato), invece di mostrare il testo
+della ricevuta stessa.
+
+Questo script:
+1. Trova tutti i messaggi in DB con pec_type di tipo ricevuta
+2. Scarica l'EML grezzo da MinIO (raw_eml_path)
+3. Lo ri-parsa con is_receipt=True (parser corretto)
+4. Aggiorna body_text e body_html nel DB
+
+Uso:
+    cd /opt/pechub
+    docker compose exec pechub-worker-1 python /app/scripts/fix_receipt_body.py
+"""
+
+import asyncio
+import logging
+import sys
+from datetime import UTC, datetime
+
+from sqlalchemy import select, update
+from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
+from sqlalchemy.orm import sessionmaker
+
+# Aggiungi il path dell'app
+sys.path.insert(0, "/app")
+
+from app.config import get_settings
+from app.models import Message
+from app.parsers.eml_parser import parse_eml
+from app.storage.minio_client import download_attachment
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+# pec_type values identifying receipts, i.e. the messages whose stored body
+# may be wrong and is re-parsed by this script
+RECEIPT_TYPES = {
+    "accettazione",
+    "non_accettazione",
+    "presa_in_carico",
+    "avvenuta_consegna",
+    "mancata_consegna",
+    "errore_consegna",
+    "preavviso_mancata_consegna",
+    "rilevazione_virus",
+}
+
+
+async def fix_receipt_bodies() -> None:
+    """
+    Re-parse every stored PEC receipt and fix its body_text/body_html.
+
+    Selects the messages whose pec_type is in RECEIPT_TYPES and that have a
+    raw_eml_path, downloads each raw EML, re-parses it with is_receipt=True
+    and updates the body columns when they differ from the stored ones. All
+    updates are committed together at the end; a failure on one message is
+    logged and counted without stopping the run. The engine is disposed even
+    when the run aborts with an exception.
+    """
+    settings = get_settings()
+
+    engine = create_async_engine(settings.database_url, echo=False)
+    async_session = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
+
+    try:
+        async with async_session() as db:
+            # Find every receipt message that has a raw EML stored
+            result = await db.execute(
+                select(Message).where(
+                    Message.pec_type.in_(RECEIPT_TYPES),
+                    Message.raw_eml_path.is_not(None),
+                ).order_by(Message.created_at)
+            )
+            messages = result.scalars().all()
+
+            logger.info(f"Trovate {len(messages)} ricevute da verificare")
+
+            fixed = 0
+            skipped = 0
+            errors = 0
+
+            for msg in messages:
+                try:
+                    # Download the raw EML (download_attachment is called with the EML path)
+                    raw_eml = await download_attachment(msg.raw_eml_path)
+                    if not raw_eml:
+                        logger.warning(f"EML non trovato su MinIO per messaggio {msg.id} (path={msg.raw_eml_path!r})")
+                        skipped += 1
+                        continue
+
+                    # Re-parse with is_receipt=True (fixed parser)
+                    parsed = parse_eml(raw_eml, is_receipt=True)
+
+                    # Skip when the body did not change
+                    new_body_text = parsed.body_text
+                    new_body_html = parsed.body_html
+
+                    if new_body_text == msg.body_text and new_body_html == msg.body_html:
+                        logger.debug(f"Messaggio {msg.id} ({msg.pec_type}): body invariato, skip")
+                        skipped += 1
+                        continue
+
+                    # Update the ORM object; persisted by the final commit
+                    msg.body_text = new_body_text
+                    msg.body_html = new_body_html
+                    msg.updated_at = datetime.now(UTC)
+
+                    logger.info(
+                        f"Fixato: id={msg.id} pec_type={msg.pec_type!r} subject={msg.subject!r} "
+                        f"body_text_len={len(new_body_text or '')}"
+                    )
+                    fixed += 1
+
+                except Exception as e:
+                    logger.error(f"Errore su messaggio {msg.id}: {e}", exc_info=True)
+                    errors += 1
+                    continue
+
+            if fixed > 0:
+                await db.commit()
+                logger.info(f"Commit eseguito: {fixed} messaggi aggiornati")
+            else:
+                logger.info("Nessun messaggio da aggiornare")
+
+            logger.info(
+                f"Completato: fixed={fixed} skipped={skipped} errors={errors} "
+                f"totale={len(messages)}"
+            )
+    finally:
+        # Release the connection pool even if the query or the commit fails
+        await engine.dispose()
+
+
+if __name__ == "__main__":
+    asyncio.run(fix_receipt_bodies())