PecHub/backend/app/services/indexing_service.py

"""
Servizio di gestione indicizzazione full-text dei messaggi.

Funzionalita':
  - Statistiche sull'indicizzazione (messaggi indicizzati vs totali)
  - Avvio reindex totale o differenziale in background
  - Monitoraggio progresso tramite Redis
  - Cancellazione di un job in corso
  - Avvio rescan allegati: ri-estrazione testo da MinIO + aggiornamento search_vector

Stato del job reindex salvato in Redis:
  pechub:reindex:{tenant_id}         -> JSON con stato corrente (TTL 24h)
  pechub:reindex:{tenant_id}:cancel  -> flag cancellazione (TTL 10min)

Stato del job rescan salvato in Redis:
  pechub:rescan:{tenant_id}          -> JSON con stato corrente (TTL 24h)
  pechub:rescan:{tenant_id}:cancel   -> flag cancellazione (TTL 10min)

Il background task usa una sessione DB propria (non quella della request).
"""

import asyncio
import io
import json
import logging
import re
import uuid
from datetime import datetime, timezone
from typing import Literal, Optional

from sqlalchemy import func, select, text
from sqlalchemy.ext.asyncio import AsyncSession

from app.core.logging import get_logger

logger = get_logger(__name__)

# ─── Costanti ─────────────────────────────────────────────────────────────────

REDIS_KEY_PREFIX = "pechub:reindex"
REDIS_RESCAN_PREFIX = "pechub:rescan"
REDIS_TTL_STATUS = 60 * 60 * 24        # 24 ore
REDIS_TTL_CANCEL = 60 * 10             # 10 minuti
BATCH_SIZE = 500                        # messaggi per batch (reindex)
RESCAN_BATCH_SIZE = 50                  # allegati per batch (rescan - piu' pesante)
STALE_THRESHOLD_HOURS = 2              # ore prima di segnalare un job come stale

MAX_EXTRACTED_TEXT_LEN = 50_000
MAX_COMBINED_TEXT_LEN = 200_000

ReindexMode = Literal["full", "differential"]
JobStatus = Literal["idle", "running", "completed", "failed", "cancelled"]

# ─── Content-type e estensioni supportate per rescan ─────────────────────────

_SUPPORTED_EXTENSIONS = {
    "pdf", "docx", "doc", "xlsx", "xls", "pptx", "ppt",
    "odt", "ods", "odp", "rtf", "txt", "csv", "xml",
    "html", "htm", "json", "eml", "msg", "p7m",
    "md",
    # Immagini (OCR)
    "png", "jpg", "jpeg", "tiff", "tif", "bmp", "gif", "webp",
}

_SUPPORTED_CONTENT_TYPES = {
    "application/pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/msword",
    "application/vnd.ms-word",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.ms-excel",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/vnd.ms-powerpoint",
    "application/vnd.oasis.opendocument.text",
    "application/vnd.oasis.opendocument.spreadsheet",
    "application/vnd.oasis.opendocument.presentation",
    "application/rtf",
    "text/rtf",
    "text/plain",
    "text/csv",
    "text/xml",
    "application/xml",
    "text/html",
    "message/rfc822",
    "application/pkcs7-mime",
    "application/x-pkcs7-mime",
    "text/markdown",
    # Immagini (OCR)
    "image/png",
    "image/jpeg",
    "image/tiff",
    "image/bmp",
    "image/gif",
    "image/webp",
}


# ─── Helpers Redis ─────────────────────────────────────────────────────────────

def _redis_key(tenant_id: uuid.UUID) -> str:
    return f"{REDIS_KEY_PREFIX}:{tenant_id}"


def _redis_cancel_key(tenant_id: uuid.UUID) -> str:
    return f"{REDIS_KEY_PREFIX}:{tenant_id}:cancel"


def _redis_rescan_key(tenant_id: uuid.UUID) -> str:
    return f"{REDIS_RESCAN_PREFIX}:{tenant_id}"


def _redis_rescan_cancel_key(tenant_id: uuid.UUID) -> str:
    return f"{REDIS_RESCAN_PREFIX}:{tenant_id}:cancel"


# ─── Estrattori testo allegati ─────────────────────────────────────────────────

def _ext(filename: str | None) -> str:
    """Restituisce l'estensione del file in minuscolo, senza punto."""
    if not filename:
        return ""
    fn = filename.lower()
    if fn.endswith(".p7m"):
        return "p7m"
    idx = fn.rfind(".")
    return fn[idx + 1:] if idx >= 0 else ""


# Soglia minima di caratteri estratti da pypdf prima di ricorrere all'OCR.
_PDF_OCR_THRESHOLD = 50
# Numero massimo di pagine OCR per evitare timeout su PDF lunghi.
_PDF_OCR_MAX_PAGES = 15


def _extract_pdf(content: bytes) -> str:
    """
    Estrae testo da PDF tramite pypdf.

    Se il testo estratto e' inferiore a _PDF_OCR_THRESHOLD caratteri (PDF
    image-only / scansione), attiva il fallback OCR via Tesseract.
    """
    try:
        import pypdf  # type: ignore[import]
        reader = pypdf.PdfReader(io.BytesIO(content))
        parts: list[str] = []
        for page in reader.pages:
            try:
                t = page.extract_text()
                if t:
                    parts.append(t)
            except Exception:
                continue
        text = " ".join(parts)
    except ImportError:
        logger.warning("pypdf non installato: impossibile estrarre testo da PDF")
        return ""
    except Exception as e:
        logger.debug(f"Errore estrazione PDF: {e}")
        return ""

    if len(text.strip()) < _PDF_OCR_THRESHOLD:
        logger.debug(
            f"PDF con testo insufficiente ({len(text.strip())} char), "
            "tentativo OCR..."
        )
        ocr_text = _extract_pdf_ocr(content)
        if len(ocr_text.strip()) > len(text.strip()):
            return ocr_text

    return text


def _extract_pdf_ocr(content: bytes) -> str:
    """
    OCR su PDF image-only tramite pdf2image + Tesseract.

    Converte le pagine a 200 DPI e applica Tesseract con lingua italiana + inglese.
    Processa al massimo _PDF_OCR_MAX_PAGES pagine per evitare timeout.
    """
    try:
        from pdf2image import convert_from_bytes  # type: ignore[import]
        import pytesseract  # type: ignore[import]

        pages = convert_from_bytes(
            content,
            dpi=200,
            last_page=_PDF_OCR_MAX_PAGES,
        )
        parts: list[str] = []
        for page_img in pages:
            try:
                t = pytesseract.image_to_string(page_img, lang="ita+eng")
                if t and t.strip():
                    parts.append(t.strip())
            except Exception:
                continue
        return " ".join(parts)
    except ImportError:
        logger.warning(
            "pdf2image o pytesseract non installati: impossibile OCR PDF"
        )
        return ""
    except Exception as e:
        logger.debug(f"Errore OCR PDF: {e}")
        return ""


def _extract_image_ocr(content: bytes) -> str:
    """
    Estrae testo da un file immagine (PNG, JPEG, TIFF, BMP, ecc.) tramite OCR.

    Usa Tesseract con lingua italiana + inglese per massima copertura
    su documenti italiani.
    """
    try:
        import pytesseract  # type: ignore[import]
        from PIL import Image  # type: ignore[import]

        img = Image.open(io.BytesIO(content))
        if img.mode not in ("RGB", "L"):
            img = img.convert("RGB")
        text = pytesseract.image_to_string(img, lang="ita+eng")
        return " ".join(text.split())
    except ImportError:
        logger.warning(
            "pytesseract o Pillow non installati: impossibile OCR immagine"
        )
        return ""
    except Exception as e:
        logger.debug(f"Errore OCR immagine: {e}")
        return ""


def _extract_docx(content: bytes) -> str:
    try:
        import docx  # type: ignore[import]
        doc = docx.Document(io.BytesIO(content))
        parts = [p.text for p in doc.paragraphs if p.text and p.text.strip()]
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    if cell.text and cell.text.strip():
                        parts.append(cell.text.strip())
        return " ".join(parts)
    except ImportError:
        logger.warning("python-docx non installato: impossibile estrarre testo da DOCX")
        return ""
    except Exception as e:
        logger.debug(f"Errore estrazione DOCX: {e}")
        return ""


def _extract_doc(content: bytes) -> str:
    """
    Estrae testo da file DOC (formato OLE2/legacy Microsoft Word).

    Prima tenta python-docx (gestisce .docx eventualmente rinominati come .doc).
    Se fallisce, esegue uno scan del binario OLE2 estraendo sequenze di caratteri
    stampabili di almeno 5 caratteri consecutivi (approccio 'strings').
    Non richiede librerie aggiuntive e funziona per la maggior parte dei .doc
    in lingua italiana/europea (ASCII + Latin-1).
    """
    # Tentativo 1: python-docx (per .docx rinominati o ZIP-based)
    result = _extract_docx(content)
    if result.strip():
        return result

    # Tentativo 2: scan binario ASCII per OLE2
    try:
        run: list[int] = []
        parts: list[str] = []
        for byte in content:
            if 32 <= byte <= 126:   # ASCII stampabile
                run.append(byte)
            else:
                if len(run) >= 5:
                    parts.append(bytes(run).decode("ascii", errors="ignore"))
                run = []
        if len(run) >= 5:
            parts.append(bytes(run).decode("ascii", errors="ignore"))

        # Mantieni solo sequenze con almeno 3 lettere (filtra sequenze di simboli)
        meaningful = [p for p in parts if sum(1 for c in p if c.isalpha()) >= 3]
        if meaningful:
            text = " ".join(meaningful)
            return " ".join(text.split())[:MAX_EXTRACTED_TEXT_LEN]
    except Exception as e:
        logger.debug(f"Errore estrazione DOC OLE2 binario: {e}")

    return ""


def _extract_plain(content: bytes) -> str:
    try:
        try:
            txt = content.decode("utf-8")
        except UnicodeDecodeError:
            txt = content.decode("latin-1", errors="replace")
        if "<" in txt and ">" in txt:
            txt = re.sub(r"<[^>]+>", " ", txt)
            txt = re.sub(r"&[a-zA-Z]+;", " ", txt)
        return " ".join(txt.split())
    except Exception as e:
        logger.debug(f"Errore estrazione plain: {e}")
        return ""


def _extract_eml(content: bytes) -> str:
    try:
        import email as emaillib
        msg = emaillib.message_from_bytes(content)
        parts: list[str] = []
        subject = msg.get("Subject", "")
        if subject:
            parts.append(subject)
        sender = msg.get("From", "")
        if sender:
            parts.append(sender)
        if msg.is_multipart():
            for part in msg.walk():
                ct = part.get_content_type()
                if ct == "text/plain":
                    try:
                        payload = part.get_payload(decode=True)
                        if payload:
                            charset = part.get_content_charset() or "utf-8"
                            parts.append(payload.decode(charset, errors="replace"))
                    except Exception:
                        pass
        else:
            payload = msg.get_payload(decode=True)
            if payload:
                charset = msg.get_content_charset() or "utf-8"
                parts.append(payload.decode(charset, errors="replace"))  # type: ignore[arg-type]
        return " ".join(parts)
    except Exception as e:
        logger.debug(f"Errore estrazione EML: {e}")
        return ""


def _unwrap_p7m_asn1(data: bytes) -> bytes | None:
    def read_tag_length(buf: bytes, offset: int):
        tag = buf[offset]
        offset += 1
        lb = buf[offset]
        offset += 1
        if lb & 0x80:
            num_bytes = lb & 0x7F
            ln = int.from_bytes(buf[offset:offset + num_bytes], "big")
            offset += num_bytes
        else:
            ln = lb
        return tag, ln, offset

    pos = 0
    try:
        tag, ln, pos = read_tag_length(data, pos)
        if tag != 0x30:
            return None
        tag, ln, pos = read_tag_length(data, pos)
        if tag != 0x06:
            return None
        pos += ln
        tag, ln, pos = read_tag_length(data, pos)
        if tag != 0xA0:
            return None
        tag, ln, pos = read_tag_length(data, pos)
        if tag != 0x30:
            return None
        tag, ln, pos = read_tag_length(data, pos)
        if tag != 0x02:
            return None
        pos += ln
        tag, ln, pos = read_tag_length(data, pos)
        if tag != 0x31:
            return None
        pos += ln
        tag, ln, pos = read_tag_length(data, pos)
        if tag != 0x30:
            return None
        tag, ln, pos = read_tag_length(data, pos)
        if tag != 0x06:
            return None
        pos += ln
        tag, ln, pos = read_tag_length(data, pos)
        if tag != 0xA0:
            return None
        tag, ln, pos = read_tag_length(data, pos)
        if tag != 0x04:
            return None
        return data[pos: pos + ln]
    except Exception:
        return None


def _extract_p7m(content: bytes, original_filename: str | None = None) -> str:
    inner_content = _unwrap_p7m_asn1(content)
    if not inner_content:
        return ""
    inner_ext = ""
    if original_filename:
        fn = original_filename.lower()
        if fn.endswith(".p7m"):
            fn = fn[:-4]
        idx = fn.rfind(".")
        if idx >= 0:
            inner_ext = fn[idx + 1:]
    extractor = _EXTRACTORS_SYNC.get(inner_ext)
    if extractor:
        return extractor(inner_content)
    if inner_content[:4] == b"%PDF":
        return _extract_pdf(inner_content)
    if inner_content[:2] == b"PK":
        for fn_try in (_extract_docx, _extract_plain):
            result = fn_try(inner_content)
            if result.strip():
                return result
    return _extract_plain(inner_content)


_EXTRACTORS_SYNC: dict = {
    "pdf":  _extract_pdf,
    "docx": _extract_docx,
    "doc":  _extract_doc,    # usa fallback OLE2 per .doc legacy
    "txt":  _extract_plain,
    "csv":  _extract_plain,
    "xml":  _extract_plain,
    "html": _extract_plain,
    "htm":  _extract_plain,
    "json": _extract_plain,
    "md":   _extract_plain,  # Markdown e' testo semplice
    "eml":  _extract_eml,
    "msg":  _extract_eml,
    "p7m":  _extract_p7m,
    # Immagini (OCR)
    "png":  _extract_image_ocr,
    "jpg":  _extract_image_ocr,
    "jpeg": _extract_image_ocr,
    "tiff": _extract_image_ocr,
    "tif":  _extract_image_ocr,
    "bmp":  _extract_image_ocr,
    "gif":  _extract_image_ocr,
    "webp": _extract_image_ocr,
}


def _resolve_extractor(content_type: str | None, filename: str | None):
    """Ritorna la funzione estrattore appropriata, o None."""
    e = _ext(filename)
    if e in _EXTRACTORS_SYNC:
        return _EXTRACTORS_SYNC[e]
    ct = (content_type or "").lower()
    _ct_map = {
        "application/pdf": "pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
        "application/msword": "doc",
        "application/vnd.ms-word": "doc",
        "text/plain": "txt",
        "text/csv": "csv",
        "text/xml": "xml",
        "application/xml": "xml",
        "text/html": "html",
        "text/markdown": "md",
        "message/rfc822": "eml",
        "application/pkcs7-mime": "p7m",
        "application/x-pkcs7-mime": "p7m",
        # Immagini (OCR)
        "image/png":  "png",
        "image/jpeg": "jpeg",
        "image/tiff": "tiff",
        "image/bmp":  "bmp",
        "image/gif":  "gif",
        "image/webp": "webp",
    }
    mapped = _ct_map.get(ct)
    if mapped:
        return _EXTRACTORS_SYNC.get(mapped)
    return None


def _is_extractable(content_type: str | None, filename: str | None) -> bool:
    e = _ext(filename)
    if e in _SUPPORTED_EXTENSIONS:
        return True
    ct = (content_type or "").lower()
    return ct in _SUPPORTED_CONTENT_TYPES


# ─── Servizio ──────────────────────────────────────────────────────────────────

class IndexingService:
    """Gestisce le operazioni di indicizzazione full-text per un tenant."""

    def __init__(self, db: AsyncSession) -> None:
        self.db = db

    # ── Statistiche ──────────────────────────────────────────────────────────

    async def get_stats(self, tenant_id: uuid.UUID) -> dict:
        """
        Restituisce le statistiche di copertura dell'indicizzazione per il tenant.
        """
        from app.models.message import Attachment, Message

        total_q = await self.db.execute(
            select(func.count(Message.id)).where(
                Message.tenant_id == tenant_id,
                Message.parent_message_id.is_(None),
            )
        )
        total_messages: int = total_q.scalar_one()

        indexed_q = await self.db.execute(
            select(func.count(Message.id)).where(
                Message.tenant_id == tenant_id,
                Message.parent_message_id.is_(None),
                Message.search_vector.isnot(None),
            )
        )
        indexed_messages: int = indexed_q.scalar_one()

        _supported_content_types_list = list(_SUPPORTED_CONTENT_TYPES)
        _supported_extensions_like = [f"%.{e}" for e in _SUPPORTED_EXTENSIONS]

        from sqlalchemy import or_
        ext_conditions = [Attachment.filename.ilike(ext) for ext in _supported_extensions_like]

        att_total_q = await self.db.execute(
            select(func.count(Attachment.id)).where(
                Attachment.tenant_id == tenant_id,
                or_(
                    Attachment.content_type.in_(_supported_content_types_list),
                    *ext_conditions,
                ),
            )
        )
        attachments_total: int = att_total_q.scalar_one()

        att_extracted_q = await self.db.execute(
            select(func.count(Attachment.id)).where(
                Attachment.tenant_id == tenant_id,
                Attachment.extracted_text.isnot(None),
            )
        )
        attachments_extracted: int = att_extracted_q.scalar_one()

        unindexed_messages = total_messages - indexed_messages
        coverage_pct = (
            round(indexed_messages / total_messages * 100, 1)
            if total_messages > 0
            else 100.0
        )
        attachments_pct = (
            round(attachments_extracted / attachments_total * 100, 1)
            if attachments_total > 0
            else 100.0
        )

        return {
            "total_messages": total_messages,
            "indexed_messages": indexed_messages,
            "unindexed_messages": unindexed_messages,
            "coverage_pct": coverage_pct,
            "attachments_total": attachments_total,
            "attachments_extracted": attachments_extracted,
            "attachments_pct": attachments_pct,
        }

    # ── Stato job reindex ─────────────────────────────────────────────────────

    @staticmethod
    async def get_job_status(tenant_id: uuid.UUID, redis_url: str) -> dict:
        """Legge lo stato del job di reindex da Redis."""
        return await IndexingService._read_job_state(
            _redis_key(tenant_id), redis_url
        )

    # ── Stato job rescan ──────────────────────────────────────────────────────

    @staticmethod
    async def get_rescan_status(tenant_id: uuid.UUID, redis_url: str) -> dict:
        """Legge lo stato del job di rescan allegati da Redis."""
        return await IndexingService._read_job_state(
            _redis_rescan_key(tenant_id), redis_url
        )

    @staticmethod
    async def _read_job_state(redis_key_str: str, redis_url: str) -> dict:
        """Helper generico: legge uno stato job da Redis."""
        import redis.asyncio as aioredis

        client = aioredis.from_url(redis_url, decode_responses=True)
        try:
            raw = await client.get(redis_key_str)
        finally:
            await client.aclose()

        if not raw:
            return {
                "status": "idle",
                "mode": None,
                "total": 0,
                "processed": 0,
                "progress_pct": 0.0,
                "started_at": None,
                "finished_at": None,
                "started_by": None,
                "elapsed_seconds": None,
                "is_stale": False,
                "error": None,
            }

        try:
            data: dict = json.loads(raw)
        except json.JSONDecodeError:
            return {"status": "idle"}

        is_stale = False
        elapsed_seconds = None
        if data.get("started_at"):
            try:
                started = datetime.fromisoformat(data["started_at"])
                finished_str = data.get("finished_at")
                ref_time = (
                    datetime.fromisoformat(finished_str)
                    if finished_str
                    else datetime.now(timezone.utc)
                )
                elapsed_seconds = int((ref_time - started).total_seconds())
                if data.get("status") == "running":
                    elapsed_hours = elapsed_seconds / 3600
                    is_stale = elapsed_hours >= STALE_THRESHOLD_HOURS
            except Exception:
                pass

        total = data.get("total", 0)
        processed = data.get("processed", 0)
        progress_pct = round(processed / total * 100, 1) if total > 0 else 0.0

        return {
            "status": data.get("status", "idle"),
            "mode": data.get("mode"),
            "total": total,
            "processed": processed,
            "progress_pct": progress_pct,
            "started_at": data.get("started_at"),
            "finished_at": data.get("finished_at"),
            "started_by": data.get("started_by"),
            "elapsed_seconds": elapsed_seconds,
            "is_stale": is_stale,
            "error": data.get("error"),
        }

    # ── Avvio reindex ─────────────────────────────────────────────────────────

    @staticmethod
    async def start_reindex(
        tenant_id: uuid.UUID,
        mode: ReindexMode,
        started_by_email: str,
        redis_url: str,
        db_url: str,
    ) -> None:
        import redis.asyncio as aioredis

        client = aioredis.from_url(redis_url, decode_responses=True)
        try:
            raw = await client.get(_redis_key(tenant_id))
            if raw:
                data = json.loads(raw)
                if data.get("status") == "running":
                    raise ValueError("Un job di reindex e' gia' in corso per questo tenant")
            # Controlla anche se il rescan e' in corso
            raw_rescan = await client.get(_redis_rescan_key(tenant_id))
            if raw_rescan:
                data_rescan = json.loads(raw_rescan)
                if data_rescan.get("status") == "running":
                    raise ValueError(
                        "Un job di scansione allegati e' in corso. "
                        "Attendi il termine prima di avviare il reindex."
                    )
        finally:
            await client.aclose()

        await IndexingService._set_state(
            _redis_key(tenant_id),
            redis_url,
            {
                "status": "running",
                "mode": mode,
                "total": 0,
                "processed": 0,
                "started_at": datetime.now(timezone.utc).isoformat(),
                "finished_at": None,
                "started_by": started_by_email,
                "error": None,
            },
        )

        asyncio.create_task(
            IndexingService._run_reindex_bg(tenant_id, mode, redis_url, db_url),
            name=f"reindex-{tenant_id}",
        )

    # ── Avvio rescan allegati ─────────────────────────────────────────────────

    @staticmethod
    async def start_rescan(
        tenant_id: uuid.UUID,
        started_by_email: str,
        redis_url: str,
        db_url: str,
        force: bool = False,
    ) -> None:
        """
        Avvia il job di rescan allegati in background.

        force=False: processa solo allegati con extracted_text IS NULL
        force=True:  processa tutti gli allegati (ri-estrae anche quelli gia' estratti)
        """
        import redis.asyncio as aioredis

        client = aioredis.from_url(redis_url, decode_responses=True)
        try:
            raw = await client.get(_redis_rescan_key(tenant_id))
            if raw:
                data = json.loads(raw)
                if data.get("status") == "running":
                    raise ValueError("Un job di scansione allegati e' gia' in corso per questo tenant")
            # Controlla anche se il reindex e' in corso
            raw_reindex = await client.get(_redis_key(tenant_id))
            if raw_reindex:
                data_reindex = json.loads(raw_reindex)
                if data_reindex.get("status") == "running":
                    raise ValueError(
                        "Un job di reindex e' in corso. "
                        "Attendi il termine prima di avviare la scansione allegati."
                    )
        finally:
            await client.aclose()

        mode_label = "force" if force else "differential"

        await IndexingService._set_state(
            _redis_rescan_key(tenant_id),
            redis_url,
            {
                "status": "running",
                "mode": mode_label,
                "total": 0,
                "processed": 0,
                "started_at": datetime.now(timezone.utc).isoformat(),
                "finished_at": None,
                "started_by": started_by_email,
                "error": None,
            },
        )

        asyncio.create_task(
            IndexingService._run_rescan_bg(tenant_id, force, redis_url, db_url),
            name=f"rescan-{tenant_id}",
        )

    # ── Cancellazione reindex ─────────────────────────────────────────────────

    @staticmethod
    async def cancel_reindex(tenant_id: uuid.UUID, redis_url: str) -> bool:
        import redis.asyncio as aioredis

        client = aioredis.from_url(redis_url, decode_responses=True)
        try:
            raw = await client.get(_redis_key(tenant_id))
            if raw:
                data = json.loads(raw)
                if data.get("status") == "running":
                    await client.setex(
                        _redis_cancel_key(tenant_id),
                        REDIS_TTL_CANCEL,
                        "1",
                    )
                    return True
        finally:
            await client.aclose()
        return False

    # ── Cancellazione rescan ──────────────────────────────────────────────────

    @staticmethod
    async def cancel_rescan(tenant_id: uuid.UUID, redis_url: str) -> bool:
        import redis.asyncio as aioredis

        client = aioredis.from_url(redis_url, decode_responses=True)
        try:
            raw = await client.get(_redis_rescan_key(tenant_id))
            if raw:
                data = json.loads(raw)
                if data.get("status") == "running":
                    await client.setex(
                        _redis_rescan_cancel_key(tenant_id),
                        REDIS_TTL_CANCEL,
                        "1",
                    )
                    return True
        finally:
            await client.aclose()
        return False

    # ── Helpers Redis ─────────────────────────────────────────────────────────

    @staticmethod
    async def _set_state(redis_key_str: str, redis_url: str, state: dict) -> None:
        import redis.asyncio as aioredis

        client = aioredis.from_url(redis_url, decode_responses=True)
        try:
            await client.setex(
                redis_key_str,
                REDIS_TTL_STATUS,
                json.dumps(state, default=str),
            )
        finally:
            await client.aclose()

    @staticmethod
    async def _check_cancel_flag(cancel_key: str, redis_url: str) -> bool:
        import redis.asyncio as aioredis

        client = aioredis.from_url(redis_url, decode_responses=True)
        try:
            flag = await client.get(cancel_key)
            return flag == "1"
        finally:
            await client.aclose()

    # Alias per retrocompatibilita' con il codice esistente
    @staticmethod
    async def _set_job_state(tenant_id: uuid.UUID, redis_url: str, state: dict) -> None:
        await IndexingService._set_state(_redis_key(tenant_id), redis_url, state)

    @staticmethod
    async def _check_cancel(tenant_id: uuid.UUID, redis_url: str) -> bool:
        return await IndexingService._check_cancel_flag(
            _redis_cancel_key(tenant_id), redis_url
        )

    # ── Logica interna reindex ─────────────────────────────────────────────────

    @staticmethod
    async def _run_reindex_bg(
        tenant_id: uuid.UUID,
        mode: ReindexMode,
        redis_url: str,
        db_url: str,
    ) -> None:
        from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
        from sqlalchemy.orm import sessionmaker

        log = logging.getLogger(__name__)
        log.info(f"Avvio reindex {mode} per tenant {tenant_id}")

        engine = create_async_engine(db_url, echo=False)
        AsyncSessionFactory = sessionmaker(  # type: ignore[call-overload]
            engine, class_=AsyncSession, expire_on_commit=False
        )

        state: dict = {
            "status": "running",
            "mode": mode,
            "total": 0,
            "processed": 0,
            "started_at": None,
            "finished_at": None,
            "started_by": None,
            "error": None,
        }

        try:
            async with AsyncSessionFactory() as db:
                from app.models.message import Message

                count_q = select(func.count(Message.id)).where(
                    Message.tenant_id == tenant_id,
                    Message.parent_message_id.is_(None),
                )
                if mode == "differential":
                    count_q = count_q.where(Message.search_vector.is_(None))

                total: int = (await db.execute(count_q)).scalar_one()

                import redis.asyncio as aioredis
                redis_client = aioredis.from_url(redis_url, decode_responses=True)
                try:
                    raw = await redis_client.get(_redis_key(tenant_id))
                    if raw:
                        state = json.loads(raw)
                    state["total"] = total
                    state["processed"] = 0
                    await redis_client.setex(
                        _redis_key(tenant_id),
                        REDIS_TTL_STATUS,
                        json.dumps(state, default=str),
                    )
                finally:
                    await redis_client.aclose()

                log.info(f"Reindex {mode}: {total} messaggi da processare")

                if total == 0:
                    state.update({
                        "status": "completed",
                        "processed": 0,
                        "finished_at": datetime.now(timezone.utc).isoformat(),
                    })
                    await IndexingService._set_job_state(tenant_id, redis_url, state)
                    return

                ids_q = select(Message.id).where(
                    Message.tenant_id == tenant_id,
                    Message.parent_message_id.is_(None),
                )
                if mode == "differential":
                    ids_q = ids_q.where(Message.search_vector.is_(None))

                ids_result = await db.execute(ids_q)
                all_ids = [str(row[0]) for row in ids_result.fetchall()]

                processed = 0
                for batch_start in range(0, len(all_ids), BATCH_SIZE):
                    if await IndexingService._check_cancel(tenant_id, redis_url):
                        log.info(f"Reindex {mode} annullato al batch {batch_start}")
                        state.update({
                            "status": "cancelled",
                            "processed": processed,
                            "finished_at": datetime.now(timezone.utc).isoformat(),
                        })
                        await IndexingService._set_job_state(tenant_id, redis_url, state)
                        return

                    batch_ids = all_ids[batch_start: batch_start + BATCH_SIZE]

                    await db.execute(
                        text("""
                            UPDATE messages
                            SET search_vector =
                                setweight(to_tsvector('italian', coalesce(subject, '')), 'A') ||
                                setweight(to_tsvector('simple',  coalesce(from_address, '')), 'B') ||
                                setweight(to_tsvector('simple',
                                    coalesce(array_to_string(to_addresses, ' '), '')), 'B') ||
                                setweight(to_tsvector('italian', coalesce(body_text, '')), 'C') ||
                                setweight(to_tsvector('italian', coalesce((
                                    SELECT string_agg(a.extracted_text, ' ')
                                    FROM attachments a
                                    WHERE a.message_id = messages.id
                                      AND a.extracted_text IS NOT NULL
                                ), '')), 'D')
                            WHERE id = ANY(:ids)
                        """),
                        {"ids": batch_ids},
                    )
                    await db.commit()

                    processed += len(batch_ids)
                    state["processed"] = processed
                    await IndexingService._set_job_state(tenant_id, redis_url, state)

                    await asyncio.sleep(0.05)

                state.update({
                    "status": "completed",
                    "processed": processed,
                    "finished_at": datetime.now(timezone.utc).isoformat(),
                })
                await IndexingService._set_job_state(tenant_id, redis_url, state)
                log.info(f"Reindex {mode} completato: {processed}/{total} messaggi")

        except Exception as exc:
            log.error(f"Errore reindex {mode} tenant {tenant_id}: {exc}", exc_info=True)
            state.update({
                "status": "failed",
                "finished_at": datetime.now(timezone.utc).isoformat(),
                "error": str(exc),
            })
            await IndexingService._set_job_state(tenant_id, redis_url, state)
        finally:
            await engine.dispose()

    # ── Logica interna rescan allegati ────────────────────────────────────────

    @staticmethod
    async def _run_rescan_bg(
        tenant_id: uuid.UUID,
        force: bool,
        redis_url: str,
        db_url: str,
    ) -> None:
        """
        Coroutine di rescan allegati eseguita in background.

        Algoritmo:
        1. Trova gli allegati del tenant con formato supportato
           (solo quelli con extracted_text IS NULL se force=False)
        2. Per ogni batch: scarica da MinIO, estrae testo, aggiorna extracted_text
        3. Dopo ogni batch: ricostruisce search_vector per i messaggi interessati
        4. Aggiorna progresso in Redis dopo ogni batch
        5. Controlla flag di cancellazione tra un batch e l'altro
        """
        from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
        from sqlalchemy.orm import sessionmaker

        log = logging.getLogger(__name__)
        mode_str = "force" if force else "differential"
        log.info(f"Avvio rescan allegati {mode_str} per tenant {tenant_id}")

        engine = create_async_engine(db_url, echo=False)
        AsyncSessionFactory = sessionmaker(  # type: ignore[call-overload]
            engine, class_=AsyncSession, expire_on_commit=False
        )

        state: dict = {
            "status": "running",
            "mode": mode_str,
            "total": 0,
            "processed": 0,
            "started_at": None,
            "finished_at": None,
            "started_by": None,
            "error": None,
        }

        try:
            async with AsyncSessionFactory() as db:
                from app.config import get_settings
                from app.models.message import Attachment

                settings = get_settings()

                # ── 1. Conta allegati da processare ───────────────────────────
                from sqlalchemy import or_

                ext_conditions = [
                    Attachment.filename.ilike(f"%.{e}") for e in _SUPPORTED_EXTENSIONS
                ]
                base_filter = [
                    Attachment.tenant_id == tenant_id,
                    or_(
                        Attachment.content_type.in_(list(_SUPPORTED_CONTENT_TYPES)),
                        *ext_conditions,
                    ),
                ]
                if not force:
                    base_filter.append(Attachment.extracted_text.is_(None))

                count_q = await db.execute(
                    select(func.count(Attachment.id)).where(*base_filter)
                )
                total: int = count_q.scalar_one()

                # Aggiorna totale in Redis
                import redis.asyncio as aioredis
                redis_client = aioredis.from_url(redis_url, decode_responses=True)
                try:
                    raw = await redis_client.get(_redis_rescan_key(tenant_id))
                    if raw:
                        state = json.loads(raw)
                    state["total"] = total
                    state["processed"] = 0
                    await redis_client.setex(
                        _redis_rescan_key(tenant_id),
                        REDIS_TTL_STATUS,
                        json.dumps(state, default=str),
                    )
                finally:
                    await redis_client.aclose()

                log.info(f"Rescan {mode_str}: {total} allegati da processare")

                if total == 0:
                    state.update({
                        "status": "completed",
                        "processed": 0,
                        "finished_at": datetime.now(timezone.utc).isoformat(),
                    })
                    await IndexingService._set_state(
                        _redis_rescan_key(tenant_id), redis_url, state
                    )
                    return

                # ── 2. Recupera IDs allegati da processare ────────────────────
                ids_q = select(Attachment.id).where(*base_filter)
                ids_result = await db.execute(ids_q)
                all_att_ids = [row[0] for row in ids_result.fetchall()]

                # ── 3. Crea client MinIO ───────────────────────────────────────
                try:
                    from miniopy_async import Minio  # type: ignore[import]
                    minio = Minio(
                        endpoint=settings.minio_endpoint,
                        access_key=settings.minio_access_key,
                        secret_key=settings.minio_secret_key,
                        secure=settings.minio_use_ssl,
                    )
                    bucket = settings.minio_bucket
                except Exception as e:
                    raise RuntimeError(f"Impossibile creare client MinIO: {e}") from e

                # ── 4. Processa in batch ───────────────────────────────────────
                processed = 0
                for batch_start in range(0, len(all_att_ids), RESCAN_BATCH_SIZE):
                    # Controlla cancellazione
                    if await IndexingService._check_cancel_flag(
                        _redis_rescan_cancel_key(tenant_id), redis_url
                    ):
                        log.info(f"Rescan {mode_str} annullato al batch {batch_start}")
                        state.update({
                            "status": "cancelled",
                            "processed": processed,
                            "finished_at": datetime.now(timezone.utc).isoformat(),
                        })
                        await IndexingService._set_state(
                            _redis_rescan_key(tenant_id), redis_url, state
                        )
                        return

                    batch_ids = all_att_ids[batch_start: batch_start + RESCAN_BATCH_SIZE]

                    # Carica allegati del batch
                    att_result = await db.execute(
                        select(Attachment).where(Attachment.id.in_(batch_ids))
                    )
                    attachments = list(att_result.scalars().all())

                    affected_message_ids: set[str] = set()

                    for att in attachments:
                        extractor = _resolve_extractor(att.content_type, att.filename)
                        if extractor is None:
                            continue

                        try:
                            response = await minio.get_object(bucket, att.storage_path)
                            content = await response.content.read()
                            response.close()
                        except Exception as e:
                            log.warning(
                                f"Impossibile scaricare allegato {att.id} "
                                f"({att.filename!r}) da MinIO: {e}"
                            )
                            continue

                        try:
                            e_name = _ext(att.filename)
                            if e_name == "p7m":
                                extracted = _extract_p7m(content, att.filename)
                            else:
                                extracted = extractor(content)  # type: ignore[operator]
                        except Exception as ex:
                            log.debug(f"Errore estrazione {att.filename!r}: {ex}")
                            continue

                        if not extracted or not extracted.strip():
                            continue

                        att.extracted_text = extracted[:MAX_EXTRACTED_TEXT_LEN]
                        affected_message_ids.add(str(att.message_id))
                        log.debug(
                            f"Testo estratto da {att.filename!r}: "
                            f"{len(att.extracted_text)} caratteri"
                        )

                    await db.flush()

                    # Ricostruisce search_vector per i messaggi interessati
                    if affected_message_ids:
                        await db.execute(
                            text("""
                                UPDATE messages
                                SET search_vector =
                                    setweight(to_tsvector('italian', coalesce(subject, '')), 'A') ||
                                    setweight(to_tsvector('simple',  coalesce(from_address, '')), 'B') ||
                                    setweight(to_tsvector('simple',
                                        coalesce(array_to_string(to_addresses, ' '), '')), 'B') ||
                                    setweight(to_tsvector('italian', coalesce(body_text, '')), 'C') ||
                                    setweight(to_tsvector('italian', coalesce((
                                        SELECT string_agg(a.extracted_text, ' ')
                                        FROM attachments a
                                        WHERE a.message_id = messages.id
                                          AND a.extracted_text IS NOT NULL
                                    ), '')), 'D')
                                WHERE id = ANY(:ids)
                            """),
                            {"ids": list(affected_message_ids)},
                        )

                    await db.commit()

                    processed += len(batch_ids)
                    state["processed"] = processed
                    await IndexingService._set_state(
                        _redis_rescan_key(tenant_id), redis_url, state
                    )

                    await asyncio.sleep(0.1)

                # ── 5. Completato ──────────────────────────────────────────────
                state.update({
                    "status": "completed",
                    "processed": processed,
                    "finished_at": datetime.now(timezone.utc).isoformat(),
                })
                await IndexingService._set_state(
                    _redis_rescan_key(tenant_id), redis_url, state
                )
                log.info(f"Rescan {mode_str} completato: {processed}/{total} allegati")

        except Exception as exc:
            log.error(f"Errore rescan tenant {tenant_id}: {exc}", exc_info=True)
            state.update({
                "status": "failed",
                "finished_at": datetime.now(timezone.utc).isoformat(),
                "error": str(exc),
            })
            await IndexingService._set_state(
                _redis_rescan_key(tenant_id), redis_url, state
            )
        finally:
            await engine.dispose()