mirror of
https://github.com/idrainformatica/PecFlow.git
synced 2026-06-16 12:45:42 +02:00
510 lines
15 KiB
Python
510 lines
15 KiB
Python
"""
Unit tests for app.parsers.eml_parser.

Coverage:
- Header parsing (Subject, From, To, Cc, Message-ID, Date)
- RFC 2047 decoding (UTF-8, ISO-8859-1, base64, quoted-printable)
- Extraction of the text/plain and text/html bodies
- Attachment extraction (single and multiple)
- EML-in-EML handling (message/rfc822)
- has_attachments flag (non-PEC-system attachments only)
- PEC system attachments (daticert.xml, postacert.eml)
- Empty / malformed EML (no crash)
- Non-multipart message
"""
|
||
|
||
import email
|
||
import textwrap
|
||
|
||
import pytest
|
||
|
||
from app.parsers.eml_parser import (
|
||
AttachmentInfo,
|
||
ParsedEmail,
|
||
decode_header,
|
||
extract_addresses,
|
||
parse_date,
|
||
parse_eml,
|
||
)
|
||
|
||
|
||
# ─── Fixture EML ──────────────────────────────────────────────────────────────
|
||
|
||
|
||
# Single-part text/plain message carrying From, To, Cc, Subject, Message-ID
# and Date headers, followed by a two-line body.
SIMPLE_EML = b"""\
From: mittente@pec.it
To: destinatario@pec.it
Cc: copia@pec.it
Subject: Test PEC Fase 3
Message-ID: <test123@pec.it>
Date: Wed, 18 Mar 2026 14:00:00 +0100
Content-Type: text/plain; charset=utf-8

Corpo del messaggio di test.
Seconda riga.
"""
|
||
|
||
# multipart/mixed message: one text/plain body part plus one base64-encoded
# application/pdf attachment named "documento.pdf".
MULTIPART_EML = b"""\
From: mittente@pec.it
To: dest@pec.it
Subject: PEC con allegato
Date: Wed, 18 Mar 2026 10:00:00 +0100
Content-Type: multipart/mixed; boundary="====boundary123===="

--====boundary123====
Content-Type: text/plain; charset=utf-8

Testo del messaggio.

--====boundary123====
Content-Type: application/pdf; name="documento.pdf"
Content-Disposition: attachment; filename="documento.pdf"
Content-Transfer-Encoding: base64

JVBERi0xLjQ=

--====boundary123====--
"""
|
||
|
||
# multipart/alternative message offering the same content as a text/plain
# part and a text/html part; it carries no attachments.
MULTIPART_HTML_EML = b"""\
From: mittente@pec.it
To: dest@pec.it
Subject: PEC multipart/alternative
Date: Wed, 18 Mar 2026 10:00:00 +0100
Content-Type: multipart/alternative; boundary="====alt===="

--====alt====
Content-Type: text/plain; charset=utf-8

Testo piano.

--====alt====
Content-Type: text/html; charset=utf-8

<html><body><p>Testo HTML.</p></body></html>

--====alt====--
"""
|
||
|
||
# Delivery receipt (X-Ricevuta: avvenuta-consegna) as multipart/mixed:
# a text/plain notice, a "daticert.xml" attachment and the original message
# embedded as an inline message/rfc822 part.
RECEIPT_EML_WITH_NESTED = b"""\
From: posta-certificata@pec.aruba.it
To: mittente@pec.it
Subject: CONSEGNA: Test PEC Fase 3
X-Ricevuta: avvenuta-consegna
X-Riferimento-Message-ID: <orig001@pec.it>
Date: Wed, 18 Mar 2026 14:05:00 +0100
Content-Type: multipart/mixed; boundary="====receipt===="

--====receipt====
Content-Type: text/plain; charset=utf-8

Il messaggio e' stato consegnato al destinatario.

--====receipt====
Content-Type: application/xml; name="daticert.xml"
Content-Disposition: attachment; filename="daticert.xml"

<?xml version="1.0"?><PostaCertificata versione="2.3"></PostaCertificata>

--====receipt====
Content-Type: message/rfc822
Content-Disposition: inline

From: mittente@pec.it
To: destinatario@pec.it
Subject: Test PEC Fase 3
Message-ID: <orig001@pec.it>
Date: Wed, 18 Mar 2026 14:00:00 +0100
Content-Type: text/plain; charset=utf-8

Corpo del messaggio originale.

--====receipt====--
"""
|
||
|
||
# multipart/mixed message with a text/plain body and two base64-encoded
# application/pdf attachments ("doc1.pdf" and "doc2.pdf").
MULTIPLE_ATTACHMENTS_EML = b"""\
From: a@pec.it
To: b@pec.it
Subject: PEC con allegati multipli
Date: Wed, 18 Mar 2026 10:00:00 +0100
Content-Type: multipart/mixed; boundary="====multi===="

--====multi====
Content-Type: text/plain; charset=utf-8

Corpo.

--====multi====
Content-Type: application/pdf; name="doc1.pdf"
Content-Disposition: attachment; filename="doc1.pdf"
Content-Transfer-Encoding: base64

AAEC

--====multi====
Content-Type: application/pdf; name="doc2.pdf"
Content-Disposition: attachment; filename="doc2.pdf"
Content-Transfer-Encoding: base64

BAEC

--====multi====--
"""
|
||
|
||
# Acceptance receipt (X-Ricevuta: accettazione) whose only non-body parts are
# "daticert.xml" and an inline message/rfc822 part named "postacert.eml".
PEC_SYSTEM_EML = b"""\
From: posta-certificata@pec.aruba.it
To: mittente@pec.it
Subject: ACCETTAZIONE: Test
X-Ricevuta: accettazione
Date: Wed, 18 Mar 2026 14:01:00 +0100
Content-Type: multipart/mixed; boundary="====sys===="

--====sys====
Content-Type: text/plain; charset=utf-8

Ricevuta di accettazione.

--====sys====
Content-Type: application/xml; name="daticert.xml"
Content-Disposition: attachment; filename="daticert.xml"

<?xml version="1.0"?><PostaCertificata versione="2.3"></PostaCertificata>

--====sys====
Content-Type: message/rfc822
Content-Disposition: inline; filename="postacert.eml"

From: mittente@pec.it
To: dest@pec.it
Subject: Test originale

Corpo.

--====sys====--
"""
|
||
|
||
|
||
# ─── Test decode_header ───────────────────────────────────────────────────────
|
||
|
||
|
||
class TestDecodeHeader:
    """Checks ``decode_header`` on plain, empty and RFC 2047 encoded values."""

    def test_stringa_semplice(self):
        """A plain ASCII value is expected back unchanged."""
        decoded = decode_header("Hello World")
        assert decoded == "Hello World"

    def test_none_ritorna_none(self):
        """``None`` input is expected to yield ``None``."""
        decoded = decode_header(None)
        assert decoded is None

    def test_stringa_vuota_ritorna_none(self):
        """An empty string is expected to yield ``None``."""
        decoded = decode_header("")
        assert decoded is None

    def test_utf8_base64(self):
        """A UTF-8 base64 encoded-word is expected to decode to its text."""
        # "PEC test" encoded as base64 UTF-8
        assert decode_header("=?utf-8?b?UEVDIHRlc3Q=?=") == "PEC test"

    def test_iso8859_quoted_printable(self):
        """An ISO-8859-1 quoted-printable encoded-word keeps its visible words."""
        # "Multa n. 123" encoded as quoted-printable ISO-8859-1
        decoded = decode_header("=?iso-8859-1?q?Multa_n=2E_123?=")
        assert decoded is not None
        assert "Multa" in decoded
        assert "123" in decoded

    def test_multipart_header(self):
        """A header made of several encoded words is decoded to a value."""
        # Header built from more than one encoded part
        decoded = decode_header("=?utf-8?b?UEVD?= =?utf-8?b?IHRlc3Q=?=")
        assert decoded is not None
        assert "PEC" in decoded

    def test_stringa_gia_decodificata(self):
        """An already-decoded subject is expected back unchanged."""
        decoded = decode_header("Oggetto normale")
        assert decoded == "Oggetto normale"
|
||
|
||
|
||
# ─── Test extract_addresses ───────────────────────────────────────────────────
|
||
|
||
|
||
class TestExtractAddresses:
    """Checks ``extract_addresses`` on single, multiple and empty inputs."""

    def test_singolo_indirizzo(self):
        """A bare address is expected in the returned collection."""
        found = extract_addresses("test@example.com")
        assert "test@example.com" in found

    def test_multipli_indirizzi(self):
        """A comma-separated list is expected to yield each address once."""
        found = extract_addresses("a@x.com, b@y.com, c@z.com")
        assert len(found) == 3
        for expected in ("a@x.com", "b@y.com", "c@z.com"):
            assert expected in found

    def test_display_name(self):
        """The address is expected to be extracted from a display-name form."""
        found = extract_addresses('"Mario Rossi" <mario@comune.it>')
        assert "mario@comune.it" in found

    def test_none_ritorna_lista_vuota(self):
        """``None`` input is expected to yield an empty list."""
        found = extract_addresses(None)
        assert found == []

    def test_stringa_vuota_ritorna_lista_vuota(self):
        """An empty string is expected to yield an empty list."""
        found = extract_addresses("")
        assert found == []
|
||
|
||
|
||
# ─── Test parse_date ──────────────────────────────────────────────────────────
|
||
|
||
|
||
class TestParseDate:
    """Checks ``parse_date`` on valid, missing and invalid date strings."""

    def test_data_valida(self):
        """A well-formed RFC-style date yields the expected calendar day."""
        parsed = parse_date("Wed, 18 Mar 2026 14:00:00 +0100")
        assert parsed is not None
        assert (parsed.year, parsed.month, parsed.day) == (2026, 3, 18)

    def test_none_ritorna_none(self):
        """``None`` input is expected to yield ``None``."""
        parsed = parse_date(None)
        assert parsed is None

    def test_stringa_invalida_ritorna_none(self):
        """A string that is not a date is expected to yield ``None``."""
        parsed = parse_date("non-una-data")
        assert parsed is None

    def test_data_senza_timezone_aggiunge_utc(self):
        """The parsed value is expected to carry timezone information."""
        # NOTE(review): the test name says "date without timezone", yet the
        # input below carries an explicit "+0000" offset — confirm whether an
        # offset-less string was intended here.
        parsed = parse_date("18 Mar 2026 14:00:00 +0000")
        assert parsed is not None
        assert parsed.tzinfo is not None
|
||
|
||
|
||
# ─── Test parse_eml – messaggio semplice ──────────────────────────────────────
|
||
|
||
|
||
class TestParseEmlSimple:
    """Checks ``parse_eml`` against the single-part ``SIMPLE_EML`` fixture."""

    def test_subject(self):
        """The Subject header is exposed as ``subject``."""
        p = parse_eml(SIMPLE_EML)
        assert p.subject == "Test PEC Fase 3"

    def test_from_address(self):
        """The From header is exposed as ``from_address``."""
        p = parse_eml(SIMPLE_EML)
        assert p.from_address == "mittente@pec.it"

    def test_to_addresses(self):
        """The To recipient is listed in ``to_addresses``."""
        p = parse_eml(SIMPLE_EML)
        assert "destinatario@pec.it" in p.to_addresses

    def test_cc_addresses(self):
        """The Cc recipient is listed in ``cc_addresses``."""
        p = parse_eml(SIMPLE_EML)
        assert "copia@pec.it" in p.cc_addresses

    def test_message_id(self):
        """The Message-ID is exposed including its angle brackets."""
        p = parse_eml(SIMPLE_EML)
        assert p.message_id == "<test123@pec.it>"

    def test_date(self):
        """The Date header is parsed into a value with the expected year."""
        p = parse_eml(SIMPLE_EML)
        assert p.date is not None
        assert p.date.year == 2026

    def test_body_text(self):
        """Both body lines end up in ``body_text``."""
        p = parse_eml(SIMPLE_EML)
        assert p.body_text is not None
        assert "Corpo del messaggio" in p.body_text
        assert "Seconda riga" in p.body_text

    def test_no_html(self):
        """A text/plain-only message has no HTML body."""
        p = parse_eml(SIMPLE_EML)
        assert p.body_html is None

    def test_no_attachments(self):
        """A single-part message reports no attachments."""
        p = parse_eml(SIMPLE_EML)
        assert p.attachments == []
        assert p.has_attachments is False

    def test_raw_message_presente(self):
        """The parsed result keeps the underlying ``Message`` object."""
        # Import the class explicitly: a bare ``import email`` does not by
        # itself load the ``email.message`` submodule, so attribute access
        # through the package would depend on another module's imports.
        from email.message import Message

        p = parse_eml(SIMPLE_EML)
        assert p.raw_message is not None
        assert isinstance(p.raw_message, Message)
|
||
|
||
|
||
# ─── Test parse_eml – multipart con allegato ──────────────────────────────────
|
||
|
||
|
||
class TestParseEmlMultipart:
    """Checks ``parse_eml`` on multipart/mixed messages with attachments."""

    def test_body_text_estratto(self):
        """The text/plain part is expected in ``body_text``."""
        parsed = parse_eml(MULTIPART_EML)
        assert parsed.body_text is not None
        assert "Testo del messaggio" in parsed.body_text

    def test_allegato_trovato(self):
        """The single PDF attachment is reported with its metadata."""
        parsed = parse_eml(MULTIPART_EML)
        assert len(parsed.attachments) == 1
        pdf = parsed.attachments[0]
        assert pdf.filename == "documento.pdf"
        assert pdf.content_type == "application/pdf"
        assert pdf.size_bytes > 0
        assert pdf.checksum_sha256 is not None
        # 64 characters: the length of a hex-encoded SHA-256 digest
        assert len(pdf.checksum_sha256) == 64

    def test_has_attachments_true(self):
        """A message with a regular attachment sets ``has_attachments``."""
        parsed = parse_eml(MULTIPART_EML)
        assert parsed.has_attachments is True

    def test_allegati_multipli(self):
        """Both PDF attachments are reported, and nothing else."""
        parsed = parse_eml(MULTIPLE_ATTACHMENTS_EML)
        names = [attachment.filename for attachment in parsed.attachments]
        assert "doc1.pdf" in names
        assert "doc2.pdf" in names
        assert len(parsed.attachments) == 2
|
||
|
||
|
||
# ─── Test parse_eml – multipart/alternative ───────────────────────────────────
|
||
|
||
|
||
class TestParseEmlAlternative:
    """Checks ``parse_eml`` on the multipart/alternative fixture."""

    def test_body_text_e_html(self):
        """Both the plain-text and the HTML alternative are extracted."""
        parsed = parse_eml(MULTIPART_HTML_EML)
        assert parsed.body_text is not None
        assert "Testo piano" in parsed.body_text
        assert parsed.body_html is not None
        assert "<html>" in parsed.body_html
        assert "Testo HTML" in parsed.body_html

    def test_no_attachments_in_alternative(self):
        """The alternative body parts are not counted as attachments."""
        parsed = parse_eml(MULTIPART_HTML_EML)
        assert parsed.has_attachments is False
|
||
|
||
|
||
# ─── Test parse_eml – ricevuta con EML-in-EML ────────────────────────────────
|
||
|
||
|
||
class TestParseEmlReceiptWithNested:
    """Checks ``parse_eml`` on PEC receipts that embed system parts."""

    def test_body_text_ricevuta(self):
        """The receipt notice text is expected in ``body_text``."""
        parsed = parse_eml(RECEIPT_EML_WITH_NESTED)
        assert parsed.body_text is not None
        assert "consegnato" in parsed.body_text.lower()

    def test_allegato_xml_daticert(self):
        """``daticert.xml`` is listed among the attachments."""
        parsed = parse_eml(RECEIPT_EML_WITH_NESTED)
        names = [attachment.filename for attachment in parsed.attachments]
        assert "daticert.xml" in names

    def test_allegato_xml_e_pec_system(self):
        """``daticert.xml`` is flagged as a PEC system attachment."""
        parsed = parse_eml(RECEIPT_EML_WITH_NESTED)
        daticert = next(
            attachment
            for attachment in parsed.attachments
            if attachment.filename == "daticert.xml"
        )
        assert daticert.is_pec_system is True

    def test_eml_annidato_trovato(self):
        """The nested original message must be present as an attachment."""
        parsed = parse_eml(RECEIPT_EML_WITH_NESTED)
        nested = [
            attachment
            for attachment in parsed.attachments
            if attachment.content_type == "message/rfc822"
        ]
        assert len(nested) >= 1

    def test_has_attachments_false_quando_solo_system(self):
        """has_attachments must be False when only PEC system attachments exist."""
        parsed = parse_eml(PEC_SYSTEM_EML)
        # daticert.xml and postacert.eml are both system parts, so
        # has_attachments is expected to be False
        assert parsed.has_attachments is False

    def test_allegati_sistema_marcati_correttamente(self):
        """Every known system file found is flagged ``is_pec_system``."""
        parsed = parse_eml(PEC_SYSTEM_EML)
        for att in parsed.attachments:
            if att.filename not in ("daticert.xml", "postacert.eml"):
                continue
            assert att.is_pec_system is True, f"{att.filename} dovrebbe essere is_pec_system=True"
|
||
|
||
|
||
# ─── Test parse_eml – edge cases ─────────────────────────────────────────────
|
||
|
||
|
||
class TestParseEmlEdgeCases:
    """Checks ``parse_eml`` on empty, malformed and unusual input."""

    def test_eml_vuoto_no_eccezione(self):
        """Empty input still yields a ``ParsedEmail`` with empty fields."""
        parsed = parse_eml(b"")
        assert isinstance(parsed, ParsedEmail)
        assert parsed.subject is None
        assert parsed.body_text is None
        assert parsed.attachments == []

    def test_eml_malformato_no_eccezione(self):
        """Bytes that are not a valid EML still yield a ``ParsedEmail``."""
        garbage = b"questo non e' un EML valido\x00\xFF"
        parsed = parse_eml(garbage)
        assert isinstance(parsed, ParsedEmail)

    def test_headers_mancanti(self):
        """Missing Subject/From/To headers map to ``None`` / empty list."""
        parsed = parse_eml(b"Content-Type: text/plain\r\n\r\nSolo corpo.")
        assert parsed.subject is None
        assert parsed.from_address is None
        assert parsed.to_addresses == []

    def test_body_con_encoding_windows1252(self):
        """A windows-1252 body with a non-ASCII byte is still extracted."""
        message = (
            b"From: a@pec.it\r\nTo: b@pec.it\r\n"
            b"Content-Type: text/plain; charset=windows-1252\r\n\r\n"
            b"Buonagiornata\xe0 tutti"
        )
        parsed = parse_eml(message)
        assert parsed.body_text is not None
        assert "Buonagiornata" in parsed.body_text

    def test_attachments_senza_filename_ignorati(self):
        """
        A part without a filename must not be added as an attachment
        if it is not text/plain or text/html.
        """
        message = (
            b"From: a@pec.it\r\nTo: b@pec.it\r\n"
            b'Content-Type: multipart/mixed; boundary="B"\r\n\r\n'
            b"--B\r\nContent-Type: text/plain\r\n\r\nBody\r\n"
            b"--B\r\nContent-Type: application/octet-stream\r\n"
            b"Content-Disposition: attachment\r\n\r\nDATA\r\n"
            b"--B--\r\n"
        )
        parsed = parse_eml(message)
        # The attachment without a filename must not show up
        for attachment in parsed.attachments:
            assert not (attachment.filename is None or attachment.filename == "")

    def test_checksum_sha256_corretto(self):
        """The attachment's SHA-256 checksum must be valid."""
        import hashlib

        parsed = parse_eml(MULTIPART_EML)
        assert len(parsed.attachments) == 1
        attachment = parsed.attachments[0]
        digest = hashlib.sha256(attachment.content).hexdigest()
        assert attachment.checksum_sha256 == digest
|
||
|
||
|
||
# ─── Test AttachmentInfo dataclass ────────────────────────────────────────────
|
||
|
||
|
||
class TestAttachmentInfoDataclass:
    """Checks field storage and flag defaults of ``AttachmentInfo``."""

    def test_campi_base(self):
        """Explicit fields are stored; both flags are False when omitted."""
        import hashlib

        payload = b"test content"
        digest = hashlib.sha256(payload).hexdigest()
        info = AttachmentInfo(
            filename="test.pdf",
            content_type="application/pdf",
            content=payload,
            size_bytes=len(payload),
            checksum_sha256=digest,
        )
        assert info.filename == "test.pdf"
        assert info.content_type == "application/pdf"
        assert info.size_bytes == 12
        assert info.is_inline is False
        assert info.is_pec_system is False

    def test_inline_flag(self):
        """``is_inline=True`` passed at construction is kept."""
        import hashlib

        payload = b"img"
        digest = hashlib.sha256(payload).hexdigest()
        info = AttachmentInfo(
            filename="img.png",
            content_type="image/png",
            content=payload,
            size_bytes=len(payload),
            checksum_sha256=digest,
            is_inline=True,
        )
        assert info.is_inline is True
|