Files
PecHub/worker/app/imap/sync.py
T
2026-03-18 17:43:03 +01:00

545 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Logica di sincronizzazione messaggi IMAP - Fase 3 aggiornata.

Responsabilità:
1. Fetch della lista UID > last_sync_uid
2. Download envelope + raw EML per ogni UID
3. Parsing completo EML tramite app.parsers (Fase 3):
   - Classificazione tipo PEC (X-Ricevuta / X-TipoRicevuta)
   - Estrazione allegati (body text/html + allegati file)
   - EML-in-EML per ricevute PEC
4. Salvataggio messaggio in tabella messages
5. Upload raw EML su MinIO
6. Upload allegati su MinIO + inserimento in tabella attachments
7. State machine messaggi outbound (sent→accepted→delivered/anomaly)
   tramite X-Riferimento-Message-ID
8. Aggiornamento last_sync_uid e last_sync_at sulla mailbox
9. Pubblicazione evento Redis per notifica WebSocket
"""
import email
import email.header
import email.utils
import json
import logging
import re
import uuid
from datetime import UTC, datetime
import aioimaplib
import redis.asyncio as aioredis
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import get_settings
from app.models import Attachment, Mailbox, Message
from app.parsers.eml_parser import parse_eml
from app.parsers.pec_parser import apply_outbound_transition, classify_pec_message
from app.storage.minio_client import upload_attachment, upload_eml
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Application settings, resolved once at import time through get_settings().
settings = get_settings()
# ─── Helper legacy (mantenuti per backward compatibility con i test) ──────────
def _decode_header(header_value: str | None) -> str | None:
"""Decodifica header RFC 2047 (es. =?utf-8?b?...?=) in stringa Python."""
if not header_value:
return None
try:
parts = email.header.decode_header(header_value)
decoded = []
for part, charset in parts:
if isinstance(part, bytes):
decoded.append(part.decode(charset or "utf-8", errors="replace"))
else:
decoded.append(part)
return "".join(decoded).strip()
except Exception:
return str(header_value)
def _extract_addresses(field: str | None) -> list[str]:
"""Estrae lista di indirizzi email da un campo To/Cc."""
if not field:
return []
try:
addresses = email.utils.getaddresses([field])
return [addr for _, addr in addresses if addr]
except Exception:
return []
def _parse_date(date_str: str | None) -> datetime | None:
"""Converte stringa data RFC 2822 in datetime con timezone."""
if not date_str:
return None
try:
parsed = email.utils.parsedate_to_datetime(date_str)
if parsed.tzinfo is None:
parsed = parsed.replace(tzinfo=UTC)
return parsed
except Exception:
return None
def _classify_pec_type(msg: email.message.Message) -> str:
    """Return the PEC type of ``msg``.

    Kept for backward compatibility: it delegates to
    ``classify_pec_message`` and exposes only the ``pec_type`` attribute of
    its result.
    """
    return classify_pec_message(msg).pec_type
def _parse_eml(raw_bytes: bytes) -> dict:
    """Parse a raw EML and return the basic fields as a plain dict.

    Backward-compatibility wrapper around ``parse_eml``, kept for the
    existing tests. ``pec_type`` defaults to ``"posta_certificata"`` and is
    replaced by the classifier's result when the parser exposes a raw
    message object. Empty address lists are reported as ``None``.
    """
    parsed = parse_eml(raw_bytes)

    raw_message = parsed.raw_message
    if raw_message is None:
        pec_type = "posta_certificata"
    else:
        pec_type = classify_pec_message(raw_message).pec_type

    fields = {
        "subject": parsed.subject,
        "from_address": parsed.from_address,
        "to_addresses": parsed.to_addresses or None,
        "cc_addresses": parsed.cc_addresses or None,
        "message_id_header": parsed.message_id,
        "sent_at": parsed.date,
        "pec_type": pec_type,
        "body_text": parsed.body_text,
        "body_html": parsed.body_html,
        "has_attachments": parsed.has_attachments,
    }
    return fields
# ─── Core sync function ───────────────────────────────────────────────────────
async def sync_new_messages(
    imap_client: aioimaplib.IMAP4 | aioimaplib.IMAP4_SSL,
    mailbox: Mailbox,
    db: AsyncSession,
    redis_client: aioredis.Redis,
) -> int:
    """Synchronise the messages with UID > ``mailbox.last_sync_uid``.

    Issues a ``SEARCH UID <last+1>:*``, fetches each returned sequence number
    (capped at ``settings.imap_max_fetch_per_cycle`` per call), and, when at
    least one message was stored, advances ``last_sync_uid`` / ``last_sync_at``
    on the mailbox and commits.

    Args:
        imap_client: Connected IMAP client used for SEARCH and FETCH.
        mailbox: Mailbox being synchronised; its sync watermark is updated.
        db: Database session used to persist messages and the watermark.
        redis_client: Redis client forwarded to the per-message save step.

    Returns:
        Number of new messages synchronised in this cycle (0 when the SEARCH
        fails, returns a non-OK status, or finds nothing).
    """
    last_uid = mailbox.last_sync_uid or 0
    search_range = f"{last_uid + 1}:*"

    # ── SEARCH UID > last_sync_uid ────────────────────────────────────────────
    try:
        status, search_data = await imap_client.search("UID", search_range)
    except Exception as e:
        logger.warning(f"[{mailbox.email_address}] SEARCH fallito: {e}")
        return 0
    if status != "OK":
        logger.warning(
            f"[{mailbox.email_address}] SEARCH status={status} data={search_data}"
        )
        return 0

    # Response lines may arrive as bytes, bytearray or str; normalise them all
    # to bytes before joining (bytearray has no .encode()).
    raw_seqs = (
        b" ".join(
            bytes(d) if isinstance(d, (bytes, bytearray)) else str(d).encode()
            for d in search_data
        )
        .decode("ascii", errors="ignore")
        .split()
    )
    # Keep only numeric tokens: the response also carries status text.
    seq_numbers = [s for s in raw_seqs if s.isdigit()]
    if not seq_numbers:
        return 0

    seq_numbers = seq_numbers[: settings.imap_max_fetch_per_cycle]
    logger.info(
        f"[{mailbox.email_address}] Trovati {len(seq_numbers)} messaggi nuovi da sincronizzare"
    )

    synced_count = 0
    max_uid_synced = last_uid
    for seq in seq_numbers:
        try:
            uid, synced = await _fetch_and_save_message_by_seq(
                imap_client=imap_client,
                seq=seq,
                last_uid=last_uid,
                mailbox=mailbox,
                db=db,
                redis_client=redis_client,
            )
            if synced and uid and uid > max_uid_synced:
                synced_count += 1
                max_uid_synced = uid
        except Exception as e:
            # One failing message must not stop the rest of the cycle.
            # NOTE(review): a later successful message still advances the
            # watermark past the failed UID, so the failed one is not retried
            # on the next cycle - confirm this is intended.
            logger.error(
                f"[{mailbox.email_address}] Errore fetch seq {seq}: {e}",
                exc_info=True,
            )

    # Update last_sync_uid and last_sync_at only when something was stored.
    if max_uid_synced > last_uid:
        mailbox.last_sync_uid = max_uid_synced
        mailbox.last_sync_at = datetime.now(UTC)
        await db.flush()
        await db.commit()
    return synced_count
async def _fetch_and_save_message_by_seq(
    imap_client: aioimaplib.IMAP4 | aioimaplib.IMAP4_SSL,
    seq: str,
    last_uid: int,
    mailbox: Mailbox,
    db: AsyncSession,
    redis_client: aioredis.Redis,
) -> tuple[int | None, bool]:
    """Fetch one message by SEQUENCE NUMBER (not UID) and persist it.

    Args:
        imap_client: Connected IMAP client.
        seq: Message sequence number, as returned by SEARCH.
        last_uid: Current sync watermark; messages at or below it are skipped.
        mailbox: Mailbox the message belongs to.
        db: Database session passed on to ``_save_message``.
        redis_client: Redis client passed on to ``_save_message``.

    Returns:
        ``(uid, saved)``: the message UID (``None`` if it could not be
        determined) and ``True`` only when the message was stored.
    """
    try:
        status, fetch_data = await imap_client.fetch(seq, "(UID RFC822 RFC822.SIZE)")
    except Exception as e:
        logger.error(f"[{mailbox.email_address}] FETCH seq {seq} fallito: {e}")
        return None, False
    if status != "OK" or not fetch_data:
        logger.warning(
            f"[{mailbox.email_address}] FETCH seq {seq} risposta vuota: {status}"
        )
        return None, False

    # Summarise the response only when DEBUG is on, and log lengths rather
    # than contents: stringifying the bytearray body would build a repr of
    # the whole raw message for every fetch.
    if logger.isEnabledFor(logging.DEBUG):
        items_info = [
            (
                type(x).__name__,
                len(x) if isinstance(x, (bytes, bytearray, str)) else str(x),
            )
            for x in fetch_data
        ]
        logger.debug(f"[{mailbox.email_address}] fetch_data seq {seq}: {items_info}")

    uid: int | None = None
    raw_eml: bytes | None = None
    size_bytes: int | None = None
    for item in fetch_data:
        if isinstance(item, bytearray):
            # bytearray items are treated as the message body.
            # NOTE(review): bodies of 200 bytes or less are ignored by this
            # threshold - confirm it is intentional.
            if len(item) > 200:
                raw_eml = bytes(item)
            continue
        if isinstance(item, bytes):
            text = item.decode("ascii", errors="ignore")
        elif isinstance(item, str):
            text = item
        else:
            continue
        # Metadata lines carry "UID <n>" and "RFC822.SIZE <n>".
        uid_match = re.search(r"UID\s+(\d+)", text)
        if uid_match:
            uid = int(uid_match.group(1))
        size_match = re.search(r"RFC822\.SIZE\s+(\d+)", text)
        if size_match:
            size_bytes = int(size_match.group(1))

    # "UID n:*" searches can return a message at or below the watermark.
    if uid is None or uid <= last_uid:
        return uid, False
    if not raw_eml:
        logger.warning(f"[{mailbox.email_address}] seq {seq} UID {uid}: body mancante")
        return uid, False
    if size_bytes is None:
        size_bytes = len(raw_eml)

    return uid, await _save_message(
        uid=uid,
        raw_eml=raw_eml,
        size_bytes=size_bytes,
        mailbox=mailbox,
        db=db,
        redis_client=redis_client,
    )
async def _fetch_and_save_message(
    imap_client: aioimaplib.IMAP4 | aioimaplib.IMAP4_SSL,
    uid: int,
    mailbox: Mailbox,
    db: AsyncSession,
    redis_client: aioredis.Redis,
) -> bool:
    """Fetch a single message by UID and persist it (one-shot sync_mailbox job).

    Args:
        imap_client: Connected IMAP client.
        uid: IMAP UID of the message to fetch.
        mailbox: Mailbox the message belongs to.
        db: Database session used for the duplicate check and the save.
        redis_client: Redis client passed on to ``_save_message``.

    Returns:
        ``True`` when the message was stored; ``False`` when it is already in
        the database, the FETCH failed, or no message body was returned.
    """
    # Skip the download entirely when the UID is already stored.
    existing = await db.execute(
        select(Message.id).where(
            Message.mailbox_id == mailbox.id,
            Message.imap_uid == uid,
        )
    )
    if existing.scalar_one_or_none():
        return False

    try:
        status, fetch_data = await imap_client.uid("FETCH", str(uid), "(RFC822 RFC822.SIZE)")
    except Exception as e:
        logger.error(f"[{mailbox.email_address}] UID FETCH {uid} fallito: {e}")
        return False
    if status != "OK" or not fetch_data:
        return False

    literal_body: bytes | None = None
    legacy_body: bytes | None = None
    size_bytes: int | None = None
    for item in fetch_data:
        if isinstance(item, bytearray):
            # The seq-based fetch in this file treats bytearray items as the
            # message body; accept them here too so both paths agree.
            literal_body = bytes(item)
        elif isinstance(item, bytes) and len(item) > 100:
            # Legacy heuristic kept as a fallback: a long bytes item is
            # taken as the body.
            legacy_body = item
        elif isinstance(item, (bytes, str)):
            s = item.decode("ascii", errors="ignore") if isinstance(item, bytes) else item
            m = re.search(r"RFC822\.SIZE\s+(\d+)", s)
            if m:
                size_bytes = int(m.group(1))

    # Prefer the bytearray body; fall back to the legacy long-bytes item.
    raw_eml = literal_body or legacy_body
    if not raw_eml:
        return False

    return await _save_message(
        uid=uid,
        raw_eml=raw_eml,
        size_bytes=size_bytes or len(raw_eml),
        mailbox=mailbox,
        db=db,
        redis_client=redis_client,
    )
# ─── Save message (Fase 3 con parser completo, allegati, state machine) ────
async def _save_message(
    uid: int,
    raw_eml: bytes,
    size_bytes: int,
    mailbox: Mailbox,
    db: AsyncSession,
    redis_client: aioredis.Redis,
) -> bool:
    """Store one EML message in the database and on MinIO.

    Steps, in order: duplicate check on (mailbox, UID); full EML parsing and
    PEC classification; outbound state-machine update when the message is a
    receipt carrying a reference message id; raw EML upload; ``Message`` row
    insert; attachment storage; Redis event publication.

    Returns:
        ``False`` when the UID is already stored for this mailbox, ``True``
        once the message has been added to the session and flushed.
    """
    # ── Idempotency ───────────────────────────────────────────────────────────
    duplicate_lookup = await db.execute(
        select(Message.id).where(
            Message.mailbox_id == mailbox.id,
            Message.imap_uid == uid,
        )
    )
    if duplicate_lookup.scalar_one_or_none():
        logger.debug(f"[{mailbox.email_address}] UID {uid} già in DB, skip")
        return False

    # ── Full EML parsing ──────────────────────────────────────────────────────
    parsed = parse_eml(raw_eml)
    pec_class = classify_pec_message(
        parsed.raw_message or email.message_from_bytes(raw_eml)
    )
    received_at = datetime.now(UTC)

    # ── State machine: locate and update the original outbound message ───────
    parent_message_id: uuid.UUID | None = None
    if pec_class.is_receipt and pec_class.riferimento_message_id:
        parent_message_id = await _apply_outbound_state_machine(
            riferimento_message_id=pec_class.riferimento_message_id,
            pec_type=pec_class.pec_type,
            tenant_id=mailbox.tenant_id,
            db=db,
        )

    # ── Raw EML upload to MinIO (failure is logged, the row is still saved) ──
    eml_path: str | None = None
    try:
        eml_path = await upload_eml(
            tenant_id=str(mailbox.tenant_id),
            mailbox_id=str(mailbox.id),
            uid=uid,
            eml_bytes=raw_eml,
        )
    except Exception as e:
        logger.error(f"[{mailbox.email_address}] Upload EML MinIO UID {uid}: {e}")

    # ── Message row ───────────────────────────────────────────────────────────
    message = Message(
        id=uuid.uuid4(),
        tenant_id=mailbox.tenant_id,
        mailbox_id=mailbox.id,
        imap_uid=uid,
        imap_folder="INBOX",
        direction="inbound",
        state="received",
        pec_type=pec_class.pec_type,
        subject=parsed.subject,
        from_address=parsed.from_address,
        to_addresses=parsed.to_addresses or None,
        cc_addresses=parsed.cc_addresses or None,
        message_id_header=parsed.message_id,
        sent_at=parsed.date,
        received_at=received_at,
        size_bytes=size_bytes,
        body_text=parsed.body_text,
        body_html=parsed.body_html,
        has_attachments=parsed.has_attachments,
        parent_message_id=parent_message_id,
        raw_eml_path=eml_path,
        is_read=False,
    )
    db.add(message)
    # Flush so the row exists before the attachment rows that reference it.
    await db.flush()

    # ── Attachments: MinIO upload + attachments table ─────────────────────────
    if parsed.attachments:
        await _save_attachments(
            attachments=parsed.attachments,
            message=message,
            mailbox=mailbox,
            db=db,
        )

    # ── Redis event for WebSocket notification (best-effort) ──────────────────
    try:
        event = {
            "type": "mailbox:new_message",
            "mailbox_id": str(mailbox.id),
            "message_id": str(message.id),
            "subject": message.subject or "",
            "from_address": message.from_address or "",
            "pec_type": message.pec_type,
            "is_receipt": pec_class.is_receipt,
            "received_at": received_at.isoformat(),
        }
        channel = f"ws:tenant:{mailbox.tenant_id}"
        await redis_client.publish(channel, json.dumps(event))
    except Exception as e:
        logger.warning(f"[{mailbox.email_address}] Redis publish UID {uid}: {e}")

    logger.info(
        f"[{mailbox.email_address}] Nuovo messaggio: UID={uid} "
        f"pec_type={pec_class.pec_type!r} "
        f"subject={message.subject!r} "
        f"allegati={len(parsed.attachments)}"
    )
    return True
async def _apply_outbound_state_machine(
    riferimento_message_id: str,
    pec_type: str,
    tenant_id: uuid.UUID,
    db: AsyncSession,
) -> uuid.UUID | None:
    """Update the state of the original outbound message from a receipt.

    Looks up the tenant's outbound message whose ``message_id_header`` equals
    ``riferimento_message_id`` and, when ``apply_outbound_transition`` returns
    a new state, stores it together with a fresh ``updated_at`` and flushes.

    Args:
        riferimento_message_id: Message-ID the receipt refers to.
        pec_type: PEC type of the receipt that was just received.
        tenant_id: Tenant owning the mailbox; restricts the lookup.
        db: Database session.

    Returns:
        The id of the original outbound message when found (whether or not
        its state changed), ``None`` otherwise.
    """
    result = await db.execute(
        select(Message).where(
            Message.tenant_id == tenant_id,
            Message.message_id_header == riferimento_message_id,
            Message.direction == "outbound",
        )
    )
    # NOTE(review): scalar_one_or_none() raises if more than one outbound row
    # matches - confirm message_id_header is unique per tenant.
    parent_msg = result.scalar_one_or_none()
    if not parent_msg:
        logger.debug(
            f"Messaggio outbound non trovato per riferimento={riferimento_message_id!r} "
            f"(potrebbe essere stato inviato da client diverso)"
        )
        return None

    new_state = apply_outbound_transition(parent_msg.state, pec_type)
    if new_state:
        old_state = parent_msg.state
        parent_msg.state = new_state
        parent_msg.updated_at = datetime.now(UTC)
        await db.flush()
        # Separator between the two states keeps the log line readable.
        logger.info(
            f"State machine outbound: {riferimento_message_id!r} "
            f"{old_state!r} -> {new_state!r} (ricevuta: {pec_type!r})"
        )
    return parent_msg.id
async def _save_attachments(
    attachments: list,
    message: Message,
    mailbox: Mailbox,
    db: AsyncSession,
) -> None:
    """Upload attachments to MinIO and add their rows to the attachments table.

    Args:
        attachments: Attachment descriptors produced by the EML parser.
        message: Stored message the attachments belong to.
        mailbox: Mailbox, used to build the storage location.
        db: Database session.

    An attachment whose upload fails is logged and skipped; the others are
    still processed. A failure of the final flush is logged, not raised.
    """
    for att in attachments:
        try:
            storage_path = await upload_attachment(
                tenant_id=str(mailbox.tenant_id),
                mailbox_id=str(mailbox.id),
                message_id=str(message.id),
                filename=att.filename,
                content=att.content,
                content_type=att.content_type,
            )
        except Exception as e:
            logger.error(
                f"Upload allegato {att.filename!r} per messaggio {message.id}: {e}"
            )
            # No DB row for an attachment that was not stored; go on with the rest.
            continue

        db.add(
            Attachment(
                id=uuid.uuid4(),
                tenant_id=message.tenant_id,
                message_id=message.id,
                filename=att.filename,
                content_type=att.content_type,
                size_bytes=att.size_bytes,
                storage_path=storage_path,
                checksum_sha256=att.checksum_sha256,
            )
        )

    # Flush once so every attachment row is sent within the current transaction.
    try:
        await db.flush()
    except Exception as e:
        logger.error(f"Errore flush allegati per messaggio {message.id}: {e}")