"""
Indicizzazione full-text dei messaggi PEC.

Responsabilita':
  1. Scarica gli allegati PDF e DOCX da MinIO
  2. Estrae il testo con pypdf (PDF) e python-docx (DOCX)
  3. Aggiorna la colonna extracted_text in attachments
  4. Aggiorna la colonna search_vector in messages includendo il testo degli allegati

Viene chiamato alla fine di _save_message in sync.py, in modo non bloccante:
un'eccezione qui non interrompe la sincronizzazione del messaggio.
"""

import io
import logging
import uuid

from sqlalchemy import select, text
from sqlalchemy.ext.asyncio import AsyncSession

logger = logging.getLogger(__name__)

# Maximum number of characters of extracted text kept per attachment.
MAX_EXTRACTED_TEXT_LEN = 50_000
# Maximum number of characters of the combined attachment text that is fed
# into the message search_vector.
MAX_COMBINED_TEXT_LEN = 200_000


# ─── Estrazione testo ─────────────────────────────────────────────────────────

def _extract_pdf_text(content: bytes) -> str:
    """Extract the text of a PDF with pypdf, on a best-effort basis.

    Args:
        content: Raw bytes of the PDF document.

    Returns:
        The text of every readable page joined by single spaces. An empty
        string is returned when pypdf is not installed or when the document
        cannot be parsed; ordinary exceptions are logged, never propagated.
    """
    # pypdf is imported lazily so the module still loads without it. Keeping
    # the import in its own ``try`` means an ImportError raised while parsing
    # is not misreported as "pypdf missing".
    try:
        import pypdf  # type: ignore[import]
    except ImportError:
        logger.warning("pypdf non installato: impossibile estrarre testo da PDF")
        return ""

    try:
        reader = pypdf.PdfReader(io.BytesIO(content))
        parts: list[str] = []
        for page_no, page in enumerate(reader.pages, start=1):
            try:
                txt = page.extract_text()
            except Exception as e:
                # A single broken page must not discard the rest of the
                # document, but leave a trace so missing text can be diagnosed.
                logger.debug(
                    f"Errore estrazione testo PDF (pagina {page_no}): {e}"
                )
                continue
            if txt:
                parts.append(txt)
        return " ".join(parts)
    except Exception as e:
        logger.debug(f"Errore estrazione testo PDF: {e}")
        return ""


def _extract_docx_text(content: bytes) -> str:
    """Extract the paragraph text of a DOCX with python-docx, best effort.

    Args:
        content: Raw bytes of the DOCX document.

    Returns:
        The text of all non-blank body paragraphs joined by single spaces.
        An empty string is returned when python-docx is not installed or the
        document cannot be opened; the failure is logged, not raised.
    """
    try:
        # Lazy import: the module must stay importable without python-docx.
        import docx  # type: ignore[import]

        document = docx.Document(io.BytesIO(content))
        non_blank: list[str] = []
        for paragraph in document.paragraphs:
            line = paragraph.text
            # Skip paragraphs that are empty or contain only whitespace.
            if line and line.strip():
                non_blank.append(line)
        return " ".join(non_blank)
    except ImportError:
        logger.warning("python-docx non installato: impossibile estrarre testo da DOCX")
        return ""
    except Exception as e:
        logger.debug(f"Errore estrazione testo DOCX: {e}")
        return ""


def _is_pdf(content_type: str | None, filename: str | None) -> bool:
    ct = (content_type or "").lower()
    fn = (filename or "").lower()
    return ct == "application/pdf" or fn.endswith(".pdf")


def _is_docx(content_type: str | None, filename: str | None) -> bool:
    ct = (content_type or "").lower()
    fn = (filename or "").lower()
    return ct in (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/msword",
        "application/vnd.ms-word",
    ) or fn.endswith((".docx", ".doc"))


# ─── Job principale ───────────────────────────────────────────────────────────

async def index_message(
    message_id: uuid.UUID,
    db: AsyncSession,
) -> None:
    """Index a message for full-text search without propagating errors.

    Every ordinary exception is logged and swallowed so that a failure here
    does not interrupt the message synchronisation flow.

    Args:
        message_id: Primary key of the message to index.
        db: Session of the caller; the indexing work joins its transaction.
    """
    try:
        # Run inside a SAVEPOINT: on PostgreSQL a failed statement aborts the
        # enclosing transaction, so merely swallowing the exception would
        # leave the caller's session unusable. Rolling back to the savepoint
        # discards only the indexing work.
        async with db.begin_nested():
            await _do_index_message(message_id, db)
    except Exception as e:
        logger.error(
            f"Errore indicizzazione messaggio {message_id}: {e}",
            exc_info=True,
        )


def _sanitize_for_postgres(raw: str) -> str:
    """Drop characters that cannot be stored in a PostgreSQL UTF-8 text value."""
    # NUL characters are rejected by PostgreSQL text values, and lone
    # surrogates cannot be encoded as UTF-8; both can show up in text
    # extracted from malformed documents.
    return raw.replace("\x00", "").encode("utf-8", "ignore").decode("utf-8")


async def _do_index_message(
    message_id: uuid.UUID,
    db: AsyncSession,
) -> None:
    """Extract attachment text and rebuild the message search_vector.

    Steps:
      1. Load the message and its attachments; return early if either is
         missing.
      2. For every PDF/DOCX attachment without cached ``extracted_text``,
         download it from MinIO, extract its text and store the (truncated)
         result on the attachment. Download failures are logged and skipped.
      3. If any attachment text is available, recompute ``search_vector``
         from subject, addresses, body and the combined attachment text,
         then flush the session.

    Args:
        message_id: Primary key of the message to index.
        db: Session used for all reads and writes; it is flushed, not
            committed.

    Raises:
        Exception: Database and import errors are not handled here; call
            ``index_message`` for the non-raising variant.
    """
    import asyncio

    from app.config import get_settings
    from app.models import Attachment, Message

    settings = get_settings()

    # ── Load the message ──────────────────────────────────────────────────────
    msg_result = await db.execute(
        select(Message).where(Message.id == message_id)
    )
    message = msg_result.scalar_one_or_none()
    if not message:
        logger.warning(f"index_message: messaggio {message_id} non trovato in DB")
        return

    # ── Load the attachments ──────────────────────────────────────────────────
    att_result = await db.execute(
        select(Attachment).where(Attachment.message_id == message_id)
    )
    attachments = list(att_result.scalars().all())

    if not attachments:
        logger.debug(f"Messaggio {message_id}: nessun allegato, skip indicizzazione allegati")
        return

    # ── Create the MinIO client ───────────────────────────────────────────────
    try:
        from miniopy_async import Minio  # type: ignore[import]

        minio = Minio(
            endpoint=settings.minio_endpoint,
            access_key=settings.minio_access_key,
            secret_key=settings.minio_secret_key,
            secure=settings.minio_use_ssl,
        )
    except Exception as e:
        logger.warning(f"Impossibile creare client MinIO per indicizzazione {message_id}: {e}")
        return

    bucket = settings.minio_bucket
    attachment_texts: list[str] = []
    indexed_count = 0

    for att in attachments:
        # Already indexed: reuse the cached text instead of downloading again.
        if att.extracted_text is not None:
            attachment_texts.append(att.extracted_text)
            continue

        # Only PDF and DOCX attachments are indexed.
        is_pdf = _is_pdf(att.content_type, att.filename)
        if not (is_pdf or _is_docx(att.content_type, att.filename)):
            continue

        # Download from MinIO; a failure only skips this attachment.
        try:
            response = await minio.get_object(bucket, att.storage_path)
            try:
                content = await response.content.read()
            finally:
                # Close the response even when the read fails, so the
                # underlying connection is not leaked.
                response.close()
        except Exception as e:
            logger.warning(
                f"Impossibile scaricare allegato {att.id} "
                f"({att.filename!r}) da MinIO: {e}"
            )
            continue

        # Parsing is CPU-bound and synchronous: run it in a worker thread so
        # the event loop stays responsive while a large document is processed.
        extractor = _extract_pdf_text if is_pdf else _extract_docx_text
        extracted = _sanitize_for_postgres(
            await asyncio.to_thread(extractor, content)
        )

        if not extracted.strip():
            continue

        # Cap the size and store the text on the attachment row.
        att.extracted_text = extracted[:MAX_EXTRACTED_TEXT_LEN]
        attachment_texts.append(att.extracted_text)
        indexed_count += 1

    # ── Rebuild search_vector including the attachment text ───────────────────
    if attachment_texts:
        combined = " ".join(attachment_texts)[:MAX_COMBINED_TEXT_LEN]

        await db.execute(
            text("""
                UPDATE messages
                SET search_vector =
                    setweight(to_tsvector('italian', coalesce(subject, '')), 'A') ||
                    setweight(to_tsvector('simple',  coalesce(from_address, '')), 'B') ||
                    setweight(to_tsvector('simple',
                        coalesce(array_to_string(to_addresses, ' '), '')), 'B') ||
                    setweight(to_tsvector('italian', coalesce(body_text, '')), 'C') ||
                    setweight(to_tsvector('italian', :att_text), 'D')
                WHERE id = :message_id
            """),
            {"att_text": combined, "message_id": str(message_id)},
        )

        # Persist the extracted_text values set on the attachments above.
        await db.flush()

        logger.info(
            f"Indicizzazione completata: messaggio {message_id}, "
            f"{indexed_count} allegati indicizzati su {len(attachments)} totali"
        )
    else:
        logger.debug(
            f"Messaggio {message_id}: nessun allegato PDF/DOCX con testo estraibile"
        )