OCR + reportistica

2026-06-17 13:15:42 +02:00 · 2026-03-27 13:54:07 +01:00
parent cbeedc2d2f
commit bb2060c1ae
26 changed files with 5503 additions and 237 deletions
@@ -2,17 +2,29 @@
 Indicizzazione full-text dei messaggi PEC.

 Responsabilita':
-  1. Scarica gli allegati PDF e DOCX da MinIO
-  2. Estrae il testo con pypdf (PDF) e python-docx (DOCX)
+  1. Scarica gli allegati da MinIO
+  2. Estrae il testo in base al formato del file
  3. Aggiorna la colonna extracted_text in attachments
  4. Aggiorna la colonna search_vector in messages includendo il testo degli allegati

+Formati supportati:
+  - PDF       (.pdf)           tramite pypdf; OCR Tesseract per scansioni e immagini (.png, .jpg/.jpeg, .tif/.tiff, .bmp, .gif, .webp)
+  - Word      (.docx, .doc)    tramite python-docx
+  - Excel     (.xlsx, .xls)    tramite openpyxl
+  - PowerPoint(.pptx, .ppt)    tramite python-pptx
+  - LibreOffice (.odt, .ods, .odp) tramite odfpy
+  - RTF       (.rtf)           tramite striprtf
+  - Testo     (.txt, .csv, .xml, .html, .htm, .json) testo grezzo
+  - Email     (.eml, .msg)     tramite stdlib email
+  - Firmati   (.p7m)           unwrap CMS poi estrae in base all'estensione interna
+
 Viene chiamato alla fine di _save_message in sync.py, in modo non bloccante:
 un'eccezione qui non interrompe la sincronizzazione del messaggio.
 """

 import io
 import logging
+import re
 import uuid

 from sqlalchemy import select, text
@@ -26,61 +38,534 @@ MAX_EXTRACTED_TEXT_LEN = 50_000
 MAX_COMBINED_TEXT_LEN = 200_000


-# ─── Estrazione testo ─────────────────────────────────────────────────────────
+# ─── Rilevamento tipo file ────────────────────────────────────────────────────

-def _extract_pdf_text(content: bytes) -> str:
-    """Estrae testo da un PDF usando pypdf."""
+def _ext(filename: str | None) -> str:
+    """Restituisce l'estensione del file in minuscolo, senza punto."""
+    if not filename:
+        return ""
+    fn = filename.lower()
+    # Gestione doppia estensione es. documento.pdf.p7m
+    if fn.endswith(".p7m"):
+        return "p7m"
+    idx = fn.rfind(".")
+    return fn[idx + 1:] if idx >= 0 else ""
+
+
+def _is_extractable(content_type: str | None, filename: str | None) -> bool:
+    """Ritorna True se il formato e' supportato dall'estrattore."""
+    ct = (content_type or "").lower()
+    e = _ext(filename)
+    return e in _EXTRACTORS or ct in _CONTENT_TYPE_MAP
+
+
+def _resolve_extractor(content_type: str | None, filename: str | None):
+    """Ritorna la funzione estrattore appropriata, o None."""
+    e = _ext(filename)
+    if e in _EXTRACTORS:
+        return _EXTRACTORS[e]
+    ct = (content_type or "").lower()
+    if ct in _CONTENT_TYPE_MAP:
+        return _EXTRACTORS.get(_CONTENT_TYPE_MAP[ct])
+    return None
+
+
+# ─── Estrattori ───────────────────────────────────────────────────────────────
+
+# Soglia minima di caratteri estratti da pypdf prima di ricorrere all'OCR.
+# Un PDF di testo reale produce migliaia di caratteri; una scansione ne produce
+# zero o pochissimi (artefatti). 50 char e' un valore conservativo sicuro.
+_PDF_OCR_THRESHOLD = 50
+
+# Numero massimo di pagine su cui eseguire OCR per evitare timeout su PDF lunghi.
+_PDF_OCR_MAX_PAGES = 15
+
+
+def _extract_pdf(content: bytes) -> str:
+    """
+    Estrae testo da PDF tramite pypdf.
+
+    Se il testo estratto e' inferiore a _PDF_OCR_THRESHOLD caratteri (PDF
+    image-only / scansione), attiva il fallback OCR via Tesseract.
+    """
    try:
        import pypdf  # type: ignore[import]
-
        reader = pypdf.PdfReader(io.BytesIO(content))
        parts: list[str] = []
        for page in reader.pages:
            try:
-                txt = page.extract_text()
-                if txt:
-                    parts.append(txt)
+                t = page.extract_text()
+                if t:
+                    parts.append(t)
            except Exception:
                continue
-        return " ".join(parts)
+        text = " ".join(parts)
    except ImportError:
        logger.warning("pypdf non installato: impossibile estrarre testo da PDF")
        return ""
    except Exception as e:
-        logger.debug(f"Errore estrazione testo PDF: {e}")
+        logger.debug(f"Errore estrazione PDF: {e}")
+        return ""
+
+    # Se il testo e' troppo corto, il PDF e' probabilmente una scansione
+    if len(text.strip()) < _PDF_OCR_THRESHOLD:
+        logger.debug(
+            f"PDF con testo insufficiente ({len(text.strip())} char), "
+            "tentativo OCR..."
+        )
+        ocr_text = _extract_pdf_ocr(content)
+        if len(ocr_text.strip()) > len(text.strip()):
+            return ocr_text
+
+    return text
+
+
+def _extract_pdf_ocr(content: bytes) -> str:
+    """
+    OCR su PDF image-only tramite pdf2image + Tesseract.
+
+    Converte le pagine del PDF in immagini PIL a 200 DPI (buon compromesso
+    qualita'/velocita' su CPU) e applica Tesseract con lingua italiana + inglese.
+    Processa al massimo _PDF_OCR_MAX_PAGES pagine per evitare timeout.
+    """
+    try:
+        from pdf2image import convert_from_bytes  # type: ignore[import]
+        import pytesseract  # type: ignore[import]
+
+        pages = convert_from_bytes(
+            content,
+            dpi=200,
+            last_page=_PDF_OCR_MAX_PAGES,
+        )
+        parts: list[str] = []
+        for page_img in pages:
+            try:
+                t = pytesseract.image_to_string(page_img, lang="ita+eng")
+                if t and t.strip():
+                    parts.append(t.strip())
+            except Exception:
+                continue
+        return " ".join(parts)
+    except ImportError:
+        logger.warning(
+            "pdf2image o pytesseract non installati: impossibile OCR PDF"
+        )
+        return ""
+    except Exception as e:
+        logger.debug(f"Errore OCR PDF: {e}")
        return ""


-def _extract_docx_text(content: bytes) -> str:
-    """Estrae testo da un DOCX usando python-docx."""
+def _extract_image_ocr(content: bytes) -> str:
+    """
+    Estrae testo da un file immagine (PNG, JPEG, TIFF, BMP, ecc.) tramite OCR.
+
+    Usa Tesseract con lingua italiana + inglese per massima copertura
+    su documenti italiani.
+    """
+    try:
+        import pytesseract  # type: ignore[import]
+        from PIL import Image  # type: ignore[import]
+
+        img = Image.open(io.BytesIO(content))
+        # Converti in RGB se necessario (TIFF multi-frame, palette, ecc.)
+        if img.mode not in ("RGB", "L"):
+            img = img.convert("RGB")
+        text = pytesseract.image_to_string(img, lang="ita+eng")
+        return " ".join(text.split())
+    except ImportError:
+        logger.warning(
+            "pytesseract o Pillow non installati: impossibile OCR immagine"
+        )
+        return ""
+    except Exception as e:
+        logger.debug(f"Errore OCR immagine: {e}")
+        return ""
+
+
+def _extract_docx(content: bytes) -> str:
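+    """Estrae testo da DOCX tramite python-docx (paragrafi e celle delle tabelle); i .doc binari legacy non sono letti dalla libreria e producono stringa vuota."""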
    try:
        import docx  # type: ignore[import]
-
        doc = docx.Document(io.BytesIO(content))
-        parts = [para.text for para in doc.paragraphs if para.text and para.text.strip()]
+        parts = [p.text for p in doc.paragraphs if p.text and p.text.strip()]
+        # Include anche le tabelle
+        for table in doc.tables:
+            for row in table.rows:
+                for cell in row.cells:
+                    if cell.text and cell.text.strip():
+                        parts.append(cell.text.strip())
        return " ".join(parts)
    except ImportError:
        logger.warning("python-docx non installato: impossibile estrarre testo da DOCX")
        return ""
    except Exception as e:
-        logger.debug(f"Errore estrazione testo DOCX: {e}")
+        logger.debug(f"Errore estrazione DOCX: {e}")
        return ""


-def _is_pdf(content_type: str | None, filename: str | None) -> bool:
-    ct = (content_type or "").lower()
-    fn = (filename or "").lower()
-    return ct == "application/pdf" or fn.endswith(".pdf")
+def _extract_xlsx(content: bytes) -> str:
+    """Estrae testo da XLSX tramite openpyxl; i .xls binari legacy non sono letti dalla libreria e producono stringa vuota."""
+    try:
+        import openpyxl  # type: ignore[import]
+        wb = openpyxl.load_workbook(io.BytesIO(content), read_only=True, data_only=True)
+        parts: list[str] = []
+        for ws in wb.worksheets:
+            for row in ws.iter_rows():
+                for cell in row:
+                    if cell.value is not None:
+                        v = str(cell.value).strip()
+                        if v:
+                            parts.append(v)
+        return " ".join(parts)
+    except ImportError:
+        logger.warning("openpyxl non installato: impossibile estrarre testo da XLSX")
+        return ""
+    except Exception as e:
+        logger.debug(f"Errore estrazione XLSX: {e}")
+        return ""


-def _is_docx(content_type: str | None, filename: str | None) -> bool:
-    ct = (content_type or "").lower()
-    fn = (filename or "").lower()
-    return ct in (
-        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        "application/msword",
-        "application/vnd.ms-word",
-    ) or fn.endswith((".docx", ".doc"))
+def _extract_pptx(content: bytes) -> str:
+    """Estrae testo da PPTX tramite python-pptx; i .ppt binari legacy non sono letti dalla libreria e producono stringa vuota."""
+    try:
+        from pptx import Presentation  # type: ignore[import]
+        prs = Presentation(io.BytesIO(content))
+        parts: list[str] = []
+        for slide in prs.slides:
+            for shape in slide.shapes:
+                if shape.has_text_frame:
+                    for para in shape.text_frame.paragraphs:
+                        t = para.text.strip()
+                        if t:
+                            parts.append(t)
+        return " ".join(parts)
+    except ImportError:
+        logger.warning("python-pptx non installato: impossibile estrarre testo da PPTX")
+        return ""
+    except Exception as e:
+        logger.debug(f"Errore estrazione PPTX: {e}")
+        return ""
+
+
+def _extract_odt(content: bytes) -> str:
+    """Estrae testo da ODT/ODS/ODP tramite odfpy."""
+    try:
+        from odf import opendocument, teletype  # type: ignore[import]
+        from odf.text import P  # type: ignore[import]
+        doc = opendocument.load(io.BytesIO(content))
+        parts: list[str] = []
+        for el in doc.body.getElementsByType(P):
+            t = teletype.extractText(el).strip()
+            if t:
+                parts.append(t)
+        return " ".join(parts)
+    except ImportError:
+        logger.warning("odfpy non installato: impossibile estrarre testo da ODT")
+        return ""
+    except Exception as e:
+        logger.debug(f"Errore estrazione ODT: {e}")
+        return ""
+
+
+def _extract_rtf(content: bytes) -> str:
+    """Estrae testo da RTF tramite striprtf."""
+    try:
+        from striprtf.striprtf import rtf_to_text  # type: ignore[import]
+        raw = content.decode("latin-1", errors="replace")
+        return rtf_to_text(raw)
+    except ImportError:
+        logger.warning("striprtf non installato: impossibile estrarre testo da RTF")
+        return ""
+    except Exception as e:
+        logger.debug(f"Errore estrazione RTF: {e}")
+        return ""
+
+
+def _extract_plain(content: bytes) -> str:
+    """Estrae testo da file di testo puro (txt, csv, xml, html, ecc.)."""
+    try:
+        # Prova UTF-8 prima, poi latin-1 come fallback
+        try:
+            text = content.decode("utf-8")
+        except UnicodeDecodeError:
+            text = content.decode("latin-1", errors="replace")
+        # Se il testo contiene sia "<" sia ">" (qualsiasi formato, non solo XML/HTML): rimuove tag ed entita' nominali
+        if "<" in text and ">" in text:
+            text = re.sub(r"<[^>]+>", " ", text)
+            text = re.sub(r"&[a-zA-Z]+;", " ", text)
+        return " ".join(text.split())
+    except Exception as e:
+        logger.debug(f"Errore estrazione testo plain: {e}")
+        return ""
+
+
+def _extract_eml(content: bytes) -> str:
+    """Estrae testo da un file EML allegato."""
+    try:
+        import email as emaillib
+        msg = emaillib.message_from_bytes(content)
+        parts: list[str] = []
+        subject = msg.get("Subject", "")
+        if subject:
+            parts.append(subject)
+        sender = msg.get("From", "")
+        if sender:
+            parts.append(sender)
+        # Estrae body
+        if msg.is_multipart():
+            for part in msg.walk():
+                ct = part.get_content_type()
+                if ct == "text/plain":
+                    try:
+                        payload = part.get_payload(decode=True)
+                        if payload:
+                            charset = part.get_content_charset() or "utf-8"
+                            parts.append(payload.decode(charset, errors="replace"))
+                    except Exception:
+                        pass
+        else:
+            payload = msg.get_payload(decode=True)
+            if payload:
+                charset = msg.get_content_charset() or "utf-8"
+                parts.append(payload.decode(charset, errors="replace"))  # type: ignore[arg-type]
+        return " ".join(parts)
+    except Exception as e:
+        logger.debug(f"Errore estrazione EML: {e}")
+        return ""
+
+
+def _extract_p7m(content: bytes, original_filename: str | None = None) -> str:
+    """
+    Estrae testo da un documento con firma digitale CAdES (.p7m).
+
+    L'unwrap del CMS envelope e' fatto dal parser ASN.1 minimale _unwrap_p7m_asn1
+    (cryptography non espone il payload). Se riesce, il formato interno e' dedotto
+    dall'estensione del nome originale (es. fattura.pdf.p7m -> PDF); altrimenti
+    dai primi byte (%PDF, PK) e, come ultima risorsa, trattato come testo semplice.
+    """
+    inner_content: bytes | None = None
+
+    # Metodo 1: cryptography (CMS/PKCS7) - NOTA: questo blocco verifica solo che gli import riescano, non estrae il payload
+    try:
+        from cryptography.hazmat.primitives.serialization import pkcs7  # type: ignore[import]
+        # load_pem_pkcs7_certificates / load_der_pkcs7_certificates non espongono il payload
+        # Usiamo il modulo backend direttamente
+        from cryptography.hazmat.backends import default_backend
+        from cryptography.hazmat.primitives.asymmetric import padding as asym_padding
+        from cryptography.x509 import load_der_x509_certificate  # noqa: F401
+
+        # Prova parsing DER diretto della struttura CMS ContentInfo
+        # La struttura ASN.1 di SignedData contiene encapContentInfo -> eContent
+        from cryptography.hazmat.bindings._rust import (  # type: ignore[import]
+            x509 as rust_x509,
+        )
+        _ = rust_x509  # solo per verificare import
+    except Exception:
+        pass
+
+    # Metodo piu' semplice: parsing ASN.1 manuale per estrarre eContent
+    # La struttura DER di CMS SignedData:
+    # SEQUENCE {
+    #   OID (signedData)
+    #   [0] EXPLICIT SEQUENCE {
+    #     INTEGER (version)
+    #     SET (digestAlgorithms)
+    #     SEQUENCE (encapContentInfo) {
+    #       OID (contentType = data)
+    #       [0] EXPLICIT OCTET STRING (eContent) <- questo e' il contenuto originale
+    #     }
+    #     ...
+    #   }
+    # }
+    try:
+        inner_content = _unwrap_p7m_asn1(content)
+    except Exception as e:
+        logger.debug(f"Unwrap P7M ASN1 fallito: {e}")
+
+    if not inner_content:
+        logger.debug("Impossibile estrarre contenuto dal file .p7m")
+        return ""
+
+    # Determina l'estensione interna dal nome file originale
+    # es. "fattura.pdf.p7m" -> inner ext = "pdf"
+    inner_ext = ""
+    if original_filename:
+        fn = original_filename.lower()
+        if fn.endswith(".p7m"):
+            fn = fn[:-4]  # rimuove .p7m
+        idx = fn.rfind(".")
+        if idx >= 0:
+            inner_ext = fn[idx + 1:]
+
+    extractor = _EXTRACTORS.get(inner_ext)
+    if extractor:
+        logger.debug(f"P7M: estrazione interna come {inner_ext!r}")
+        return extractor(inner_content)
+
+    # Fallback: prova a riconoscere il formato dall'header del contenuto
+    if inner_content[:4] == b"%PDF":
+        return _extract_pdf(inner_content)
+    if inner_content[:2] in (b"PK",):  # ZIP-based (docx, xlsx, pptx, odt)
+        # Prova nell'ordine piu' comune
+        for fn in (_extract_docx, _extract_xlsx, _extract_pptx, _extract_odt):
+            result = fn(inner_content)
+            if result.strip():
+                return result
+    # Ultimo tentativo: plain text
+    return _extract_plain(inner_content)
+
+
+def _unwrap_p7m_asn1(data: bytes) -> bytes | None:
+    """
+    Parsing ASN.1 DER minimale per estrarre eContent da una struttura CMS SignedData.
+    Non verifica la firma: serve solo per l'estrazione del testo. Ritorna None se un tag non e' quello atteso (es. firma detached, OCTET STRING costruita BER).
+    """
+    pos = 0
+    length = len(data)
+
+    def read_tag_length(buf: bytes, offset: int) -> tuple[int, int, int]:
+        """Ritorna (tag, length, new_offset)."""
+        tag = buf[offset]
+        offset += 1
+        lb = buf[offset]
+        offset += 1
+        if lb & 0x80:
+            num_bytes = lb & 0x7F
+            ln = int.from_bytes(buf[offset:offset + num_bytes], "big")
+            offset += num_bytes
+        else:
+            ln = lb
+        return tag, ln, offset
+
+    # outer SEQUENCE
+    tag, ln, pos = read_tag_length(data, pos)
+    if tag != 0x30:
+        return None
+
+    # OID (contentType = signedData)
+    tag, ln, pos = read_tag_length(data, pos)
+    if tag != 0x06:
+        return None
+    pos += ln  # skip OID value
+
+    # [0] EXPLICIT
+    tag, ln, pos = read_tag_length(data, pos)
+    if tag != 0xA0:
+        return None
+
+    # SEQUENCE (SignedData)
+    tag, ln, pos = read_tag_length(data, pos)
+    if tag != 0x30:
+        return None
+
+    # INTEGER (version)
+    tag, ln, pos = read_tag_length(data, pos)
+    if tag != 0x02:
+        return None
+    pos += ln
+
+    # SET (digestAlgorithms)
+    tag, ln, pos = read_tag_length(data, pos)
+    if tag != 0x31:
+        return None
+    pos += ln
+
+    # SEQUENCE (encapContentInfo)
+    tag, ln, pos = read_tag_length(data, pos)
+    if tag != 0x30:
+        return None
+
+    # OID (contentType dentro encapContentInfo)
+    tag, ln, pos = read_tag_length(data, pos)
+    if tag != 0x06:
+        return None
+    pos += ln
+
+    # [0] EXPLICIT (eContent, opzionale)
+    tag, ln, pos = read_tag_length(data, pos)
+    if tag != 0xA0:
+        return None
+
+    # OCTET STRING con il contenuto originale
+    tag, ln, pos = read_tag_length(data, pos)
+    if tag != 0x04:
+        return None
+    return data[pos: pos + ln]
+
+
+# ─── Mapping formato -> estrattore ────────────────────────────────────────────
+
+_EXTRACTORS: dict[str, object] = {
+    # Documenti Office
+    "pdf":  _extract_pdf,
+    "docx": _extract_docx,
+    "doc":  _extract_docx,
+    "xlsx": _extract_xlsx,
+    "xls":  _extract_xlsx,
+    "pptx": _extract_pptx,
+    "ppt":  _extract_pptx,
+    # LibreOffice
+    "odt":  _extract_odt,
+    "ods":  _extract_odt,
+    "odp":  _extract_odt,
+    # Testo
+    "txt":  _extract_plain,
+    "csv":  _extract_plain,
+    "xml":  _extract_plain,
+    "html": _extract_plain,
+    "htm":  _extract_plain,
+    "json": _extract_plain,
+    # RTF
+    "rtf":  _extract_rtf,
+    # Email
+    "eml":  _extract_eml,
+    "msg":  _extract_eml,
+    # Firma digitale CAdES
+    "p7m":  _extract_p7m,
+    # Immagini (OCR)
+    "png":  _extract_image_ocr,
+    "jpg":  _extract_image_ocr,
+    "jpeg": _extract_image_ocr,
+    "tiff": _extract_image_ocr,
+    "tif":  _extract_image_ocr,
+    "bmp":  _extract_image_ocr,
+    "gif":  _extract_image_ocr,
+    "webp": _extract_image_ocr,
+}
+
+# Mapping content-type -> estensione normalizzata (fallback quando il filename manca o ha un'estensione non riconosciuta)
+_CONTENT_TYPE_MAP: dict[str, str] = {
+    "application/pdf": "pdf",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
+    "application/msword": "doc",
+    "application/vnd.ms-word": "doc",
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
+    "application/vnd.ms-excel": "xls",
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
+    "application/vnd.ms-powerpoint": "ppt",
+    "application/vnd.oasis.opendocument.text": "odt",
+    "application/vnd.oasis.opendocument.spreadsheet": "ods",
+    "application/vnd.oasis.opendocument.presentation": "odp",
+    "application/rtf": "rtf",
+    "text/rtf": "rtf",
+    "text/plain": "txt",
+    "text/csv": "csv",
+    "text/xml": "xml",
+    "application/xml": "xml",
+    "text/html": "html",
+    "message/rfc822": "eml",
+    "application/pkcs7-mime": "p7m",
+    "application/x-pkcs7-mime": "p7m",
+    # Immagini (OCR)
+    "image/png":  "png",
+    "image/jpeg": "jpeg",
+    "image/jpg":  "jpeg",
+    "image/tiff": "tiff",
+    "image/bmp":  "bmp",
+    "image/gif":  "gif",
+    "image/webp": "webp",
+}


 # ─── Job principale ───────────────────────────────────────────────────────────
@@ -157,8 +642,13 @@ async def _do_index_message(
            attachment_texts.append(att.extracted_text)
            continue

-        # Controlla se e' un PDF o DOCX
-        if not (_is_pdf(att.content_type, att.filename) or _is_docx(att.content_type, att.filename)):
+        # Controlla se il formato e' supportato
+        extractor = _resolve_extractor(att.content_type, att.filename)
+        if extractor is None:
+            logger.debug(
+                f"Formato non supportato per indicizzazione: "
+                f"{att.filename!r} ({att.content_type!r})"
+            )
            continue

        # Scarica da MinIO
@@ -173,19 +663,29 @@ async def _do_index_message(
            )
            continue

-        # Estrai testo
-        if _is_pdf(att.content_type, att.filename):
-            extracted = _extract_pdf_text(content)
-        else:
-            extracted = _extract_docx_text(content)
-
-        if not extracted or not extracted.strip():
+        # Estrai testo - per p7m passa anche il filename originale
+        try:
+            e = _ext(att.filename)
+            if e == "p7m":
+                extracted = _extract_p7m(content, att.filename)
+            else:
+                extracted = extractor(content)  # type: ignore[operator]
+        except Exception as ex:
+            logger.debug(f"Errore estrazione {att.filename!r}: {ex}")
            continue

-        # Limita la dimensione e salva
+        if not extracted or not extracted.strip():
+            logger.debug(f"Nessun testo estratto da {att.filename!r}")
+            continue
+
+        # Limita la dimensione e salva sull'ORM object (col. mappata)
        att.extracted_text = extracted[:MAX_EXTRACTED_TEXT_LEN]
        attachment_texts.append(att.extracted_text)
        indexed_count += 1
+        logger.debug(
+            f"Testo estratto da {att.filename!r}: "
+            f"{len(att.extracted_text)} caratteri"
+        )

    # ── Aggiorna search_vector includendo il testo degli allegati ─────────────
    if attachment_texts:
@@ -214,5 +714,5 @@ async def _do_index_message(
        )
    else:
        logger.debug(
-            f"Messaggio {message_id}: nessun allegato PDF/DOCX con testo estraibile"
+            f"Messaggio {message_id}: nessun allegato con testo estraibile"
        )