Files
PecHub/worker/tests/unit/test_eml_parser.py
T
2026-03-18 17:43:03 +01:00

510 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Test unitari per app.parsers.eml_parser.
Copertura:
- Parsing header (Subject, From, To, Cc, Message-ID, Date)
- Decodifica RFC 2047 (UTF-8, ISO-8859-1, base64, quoted-printable)
- Estrazione body text/plain e text/html
- Estrazione allegati (singoli e multipli)
- Gestione EML-in-EML (message/rfc822)
- Flag has_attachments (solo allegati non-PEC-system)
- Allegati PEC di sistema (daticert.xml, postacert.eml)
- EML vuoto / malformato (no crash)
- Messaggio non-multipart
"""
import email
import textwrap
import pytest
from app.parsers.eml_parser import (
AttachmentInfo,
ParsedEmail,
decode_header,
extract_addresses,
parse_date,
parse_eml,
)
# ─── Fixture EML ──────────────────────────────────────────────────────────────
# Minimal single-part text/plain message with the common headers
# (From, To, Cc, Subject, Message-ID, Date).
# The empty line after the last header is the mandatory header/body
# separator: without it the stdlib parser flags the message with a
# MissingHeaderBodySeparatorDefect.
SIMPLE_EML = b"""\
From: mittente@pec.it
To: destinatario@pec.it
Cc: copia@pec.it
Subject: Test PEC Fase 3
Message-ID: <test123@pec.it>
Date: Wed, 18 Mar 2026 14:00:00 +0100
Content-Type: text/plain; charset=utf-8

Corpo del messaggio di test.
Seconda riga.
"""
# multipart/mixed message: one text/plain body plus one base64-encoded PDF
# attachment ("JVBERi0xLjQ=" decodes to b"%PDF-1.4").
# Every header section (top level and each MIME part) is terminated by an
# empty line so that the part bodies are well-formed and defect-free.
MULTIPART_EML = b"""\
From: mittente@pec.it
To: dest@pec.it
Subject: PEC con allegato
Date: Wed, 18 Mar 2026 10:00:00 +0100
Content-Type: multipart/mixed; boundary="====boundary123===="

--====boundary123====
Content-Type: text/plain; charset=utf-8

Testo del messaggio.
--====boundary123====
Content-Type: application/pdf; name="documento.pdf"
Content-Disposition: attachment; filename="documento.pdf"
Content-Transfer-Encoding: base64

JVBERi0xLjQ=
--====boundary123====--
"""
# multipart/alternative message carrying the same content as text/plain and
# text/html, with no attachments.
# Each header section ends with an empty line (header/body separator).
MULTIPART_HTML_EML = b"""\
From: mittente@pec.it
To: dest@pec.it
Subject: PEC multipart/alternative
Date: Wed, 18 Mar 2026 10:00:00 +0100
Content-Type: multipart/alternative; boundary="====alt===="

--====alt====
Content-Type: text/plain; charset=utf-8

Testo piano.
--====alt====
Content-Type: text/html; charset=utf-8

<html><body><p>Testo HTML.</p></body></html>
--====alt====--
"""
# Delivery receipt (X-Ricevuta: avvenuta-consegna) made of three parts:
# a text/plain notice, the daticert.xml attachment and the original message
# embedded as message/rfc822.
# The empty line after the message/rfc822 part headers is essential: without
# it the embedded From/To/Subject lines are read as headers of the MIME part
# itself and no nested message is produced.
RECEIPT_EML_WITH_NESTED = b"""\
From: posta-certificata@pec.aruba.it
To: mittente@pec.it
Subject: CONSEGNA: Test PEC Fase 3
X-Ricevuta: avvenuta-consegna
X-Riferimento-Message-ID: <orig001@pec.it>
Date: Wed, 18 Mar 2026 14:05:00 +0100
Content-Type: multipart/mixed; boundary="====receipt===="

--====receipt====
Content-Type: text/plain; charset=utf-8

Il messaggio e' stato consegnato al destinatario.
--====receipt====
Content-Type: application/xml; name="daticert.xml"
Content-Disposition: attachment; filename="daticert.xml"

<?xml version="1.0"?><PostaCertificata versione="2.3"></PostaCertificata>
--====receipt====
Content-Type: message/rfc822
Content-Disposition: inline

From: mittente@pec.it
To: destinatario@pec.it
Subject: Test PEC Fase 3
Message-ID: <orig001@pec.it>
Date: Wed, 18 Mar 2026 14:00:00 +0100
Content-Type: text/plain; charset=utf-8

Corpo del messaggio originale.
--====receipt====--
"""
# multipart/mixed message with a text body and two base64-encoded PDF
# attachments ("AAEC" -> b"\x00\x01\x02", "BAEC" -> b"\x04\x01\x02").
# Each header section ends with an empty line (header/body separator).
MULTIPLE_ATTACHMENTS_EML = b"""\
From: a@pec.it
To: b@pec.it
Subject: PEC con allegati multipli
Date: Wed, 18 Mar 2026 10:00:00 +0100
Content-Type: multipart/mixed; boundary="====multi===="

--====multi====
Content-Type: text/plain; charset=utf-8

Corpo.
--====multi====
Content-Type: application/pdf; name="doc1.pdf"
Content-Disposition: attachment; filename="doc1.pdf"
Content-Transfer-Encoding: base64

AAEC
--====multi====
Content-Type: application/pdf; name="doc2.pdf"
Content-Disposition: attachment; filename="doc2.pdf"
Content-Transfer-Encoding: base64

BAEC
--====multi====--
"""
# Acceptance receipt (X-Ricevuta: accettazione) whose only attachments are
# the PEC system ones: daticert.xml and the original message embedded as
# message/rfc822 under the filename postacert.eml.
# Empty lines separate every header section from its body, including the
# headers of the embedded message.
PEC_SYSTEM_EML = b"""\
From: posta-certificata@pec.aruba.it
To: mittente@pec.it
Subject: ACCETTAZIONE: Test
X-Ricevuta: accettazione
Date: Wed, 18 Mar 2026 14:01:00 +0100
Content-Type: multipart/mixed; boundary="====sys===="

--====sys====
Content-Type: text/plain; charset=utf-8

Ricevuta di accettazione.
--====sys====
Content-Type: application/xml; name="daticert.xml"
Content-Disposition: attachment; filename="daticert.xml"

<?xml version="1.0"?><PostaCertificata versione="2.3"></PostaCertificata>
--====sys====
Content-Type: message/rfc822
Content-Disposition: inline; filename="postacert.eml"

From: mittente@pec.it
To: dest@pec.it
Subject: Test originale

Corpo.
--====sys====--
"""
# ─── Test decode_header ───────────────────────────────────────────────────────
class TestDecodeHeader:
    """Checks for ``decode_header`` on plain and RFC 2047 encoded values."""

    def test_stringa_semplice(self):
        plain = "Hello World"
        assert decode_header(plain) == plain

    def test_none_ritorna_none(self):
        decoded = decode_header(None)
        assert decoded is None

    def test_stringa_vuota_ritorna_none(self):
        decoded = decode_header("")
        assert decoded is None

    def test_utf8_base64(self):
        # Base64 encoded-word carrying the UTF-8 text "PEC test".
        decoded = decode_header("=?utf-8?b?UEVDIHRlc3Q=?=")
        assert decoded == "PEC test"

    def test_iso8859_quoted_printable(self):
        # Quoted-printable encoded-word in ISO-8859-1 ("Multa n. 123").
        decoded = decode_header("=?iso-8859-1?q?Multa_n=2E_123?=")
        assert decoded is not None
        for fragment in ("Multa", "123"):
            assert fragment in decoded

    def test_multipart_header(self):
        # A header value made of two consecutive encoded-words.
        decoded = decode_header("=?utf-8?b?UEVD?= =?utf-8?b?IHRlc3Q=?=")
        assert decoded is not None
        assert "PEC" in decoded

    def test_stringa_gia_decodificata(self):
        plain = "Oggetto normale"
        assert decode_header(plain) == plain
# ─── Test extract_addresses ───────────────────────────────────────────────────
class TestExtractAddresses:
    """Checks for ``extract_addresses`` on address-list header values."""

    def test_singolo_indirizzo(self):
        found = extract_addresses("test@example.com")
        assert "test@example.com" in found

    def test_multipli_indirizzi(self):
        expected = ("a@x.com", "b@y.com", "c@z.com")
        found = extract_addresses(", ".join(expected))
        assert len(found) == 3
        for address in expected:
            assert address in found

    def test_display_name(self):
        # Only the address inside the angle brackets is looked for.
        found = extract_addresses('"Mario Rossi" <mario@comune.it>')
        assert "mario@comune.it" in found

    def test_none_ritorna_lista_vuota(self):
        found = extract_addresses(None)
        assert found == []

    def test_stringa_vuota_ritorna_lista_vuota(self):
        found = extract_addresses("")
        assert found == []
# ─── Test parse_date ──────────────────────────────────────────────────────────
class TestParseDate:
    """Checks for ``parse_date`` on RFC 5322 style date strings."""

    def test_data_valida(self):
        d = parse_date("Wed, 18 Mar 2026 14:00:00 +0100")
        assert d is not None
        assert d.year == 2026
        assert d.month == 3
        assert d.day == 18

    def test_none_ritorna_none(self):
        assert parse_date(None) is None

    def test_stringa_invalida_ritorna_none(self):
        assert parse_date("non-una-data") is None

    def test_data_senza_timezone_aggiunge_utc(self):
        """A date without any zone information must come back timezone-aware."""
        # The input deliberately carries NO UTC offset. The previous value
        # ended in "+0000", which is already timezone-aware, so the
        # "missing timezone" path named by this test was never exercised.
        # NOTE(review): assumes parse_date accepts zone-less dates and
        # attaches a tzinfo, as the test name states - confirm against
        # app.parsers.eml_parser.parse_date.
        d = parse_date("18 Mar 2026 14:00:00")
        assert d is not None
        assert d.tzinfo is not None
# ─── Test parse_eml messaggio semplice ──────────────────────────────────────
class TestParseEmlSimple:
    """Checks for ``parse_eml`` on the single-part ``SIMPLE_EML`` fixture."""

    def test_subject(self):
        parsed = parse_eml(SIMPLE_EML)
        assert parsed.subject == "Test PEC Fase 3"

    def test_from_address(self):
        parsed = parse_eml(SIMPLE_EML)
        assert parsed.from_address == "mittente@pec.it"

    def test_to_addresses(self):
        recipients = parse_eml(SIMPLE_EML).to_addresses
        assert "destinatario@pec.it" in recipients

    def test_cc_addresses(self):
        copies = parse_eml(SIMPLE_EML).cc_addresses
        assert "copia@pec.it" in copies

    def test_message_id(self):
        parsed = parse_eml(SIMPLE_EML)
        assert parsed.message_id == "<test123@pec.it>"

    def test_date(self):
        parsed = parse_eml(SIMPLE_EML)
        assert parsed.date is not None
        assert parsed.date.year == 2026

    def test_body_text(self):
        parsed = parse_eml(SIMPLE_EML)
        assert parsed.body_text is not None
        # Both lines of the fixture body must be present.
        for fragment in ("Corpo del messaggio", "Seconda riga"):
            assert fragment in parsed.body_text

    def test_no_html(self):
        parsed = parse_eml(SIMPLE_EML)
        assert parsed.body_html is None

    def test_no_attachments(self):
        parsed = parse_eml(SIMPLE_EML)
        assert parsed.attachments == []
        assert parsed.has_attachments is False

    def test_raw_message_presente(self):
        raw_message = parse_eml(SIMPLE_EML).raw_message
        assert raw_message is not None
        assert isinstance(raw_message, email.message.Message)
# ─── Test parse_eml multipart con allegato ──────────────────────────────────
class TestParseEmlMultipart:
    """Checks for ``parse_eml`` on multipart/mixed fixtures with attachments."""

    def test_body_text_estratto(self):
        body = parse_eml(MULTIPART_EML).body_text
        assert body is not None
        assert "Testo del messaggio" in body

    def test_allegato_trovato(self):
        attachments = parse_eml(MULTIPART_EML).attachments
        assert len(attachments) == 1
        pdf = attachments[0]
        assert pdf.filename == "documento.pdf"
        assert pdf.content_type == "application/pdf"
        assert pdf.size_bytes > 0
        assert pdf.checksum_sha256 is not None
        # 64 characters is the length of a SHA-256 hex digest.
        assert len(pdf.checksum_sha256) == 64

    def test_has_attachments_true(self):
        parsed = parse_eml(MULTIPART_EML)
        assert parsed.has_attachments is True

    def test_allegati_multipli(self):
        attachments = parse_eml(MULTIPLE_ATTACHMENTS_EML).attachments
        names = [attachment.filename for attachment in attachments]
        for expected_name in ("doc1.pdf", "doc2.pdf"):
            assert expected_name in names
        assert len(attachments) == 2
# ─── Test parse_eml multipart/alternative ───────────────────────────────────
class TestParseEmlAlternative:
    """Checks for ``parse_eml`` on the multipart/alternative fixture."""

    def test_body_text_e_html(self):
        parsed = parse_eml(MULTIPART_HTML_EML)
        plain, html = parsed.body_text, parsed.body_html
        assert plain is not None
        assert "Testo piano" in plain
        assert html is not None
        for fragment in ("<html>", "Testo HTML"):
            assert fragment in html

    def test_no_attachments_in_alternative(self):
        parsed = parse_eml(MULTIPART_HTML_EML)
        assert parsed.has_attachments is False
# ─── Test parse_eml ricevuta con EML-in-EML ────────────────────────────────
class TestParseEmlReceiptWithNested:
    """Checks for ``parse_eml`` on PEC receipts with system attachments."""

    def test_body_text_ricevuta(self):
        body = parse_eml(RECEIPT_EML_WITH_NESTED).body_text
        assert body is not None
        assert "consegnato" in body.lower()

    def test_allegato_xml_daticert(self):
        attachments = parse_eml(RECEIPT_EML_WITH_NESTED).attachments
        names = [attachment.filename for attachment in attachments]
        assert "daticert.xml" in names

    def test_allegato_xml_e_pec_system(self):
        attachments = parse_eml(RECEIPT_EML_WITH_NESTED).attachments
        daticert = next(
            attachment
            for attachment in attachments
            if attachment.filename == "daticert.xml"
        )
        assert daticert.is_pec_system is True

    def test_eml_annidato_trovato(self):
        """The nested original message must be exposed as an attachment."""
        attachments = parse_eml(RECEIPT_EML_WITH_NESTED).attachments
        nested_count = sum(
            1
            for attachment in attachments
            if attachment.content_type == "message/rfc822"
        )
        assert nested_count >= 1

    def test_has_attachments_false_quando_solo_system(self):
        """has_attachments must be False when only PEC system attachments exist."""
        parsed = parse_eml(PEC_SYSTEM_EML)
        # daticert.xml and postacert.eml are both system attachments, so the
        # flag is expected to stay False.
        assert parsed.has_attachments is False

    def test_allegati_sistema_marcati_correttamente(self):
        system_names = ("daticert.xml", "postacert.eml")
        for attachment in parse_eml(PEC_SYSTEM_EML).attachments:
            if attachment.filename not in system_names:
                continue
            assert attachment.is_pec_system is True, (
                f"{attachment.filename} dovrebbe essere is_pec_system=True"
            )
# ─── Test parse_eml edge cases ─────────────────────────────────────────────
class TestParseEmlEdgeCases:
    """Checks for ``parse_eml`` on empty, malformed and unusual input."""

    def test_eml_vuoto_no_eccezione(self):
        parsed = parse_eml(b"")
        assert isinstance(parsed, ParsedEmail)
        assert parsed.subject is None
        assert parsed.body_text is None
        assert parsed.attachments == []

    def test_eml_malformato_no_eccezione(self):
        garbage = b"questo non e' un EML valido\x00\xFF"
        assert isinstance(parse_eml(garbage), ParsedEmail)

    def test_headers_mancanti(self):
        parsed = parse_eml(b"Content-Type: text/plain\r\n\r\nSolo corpo.")
        assert parsed.subject is None
        assert parsed.from_address is None
        assert parsed.to_addresses == []

    def test_body_con_encoding_windows1252(self):
        # CRLF-joined message whose body contains the byte 0xE0.
        raw = b"\r\n".join(
            [
                b"From: a@pec.it",
                b"To: b@pec.it",
                b"Content-Type: text/plain; charset=windows-1252",
                b"",
                b"Buonagiornata\xe0 tutti",
            ]
        )
        body = parse_eml(raw).body_text
        assert body is not None
        assert "Buonagiornata" in body

    def test_attachments_senza_filename_ignorati(self):
        """
        A part without a filename must not be added as an attachment
        unless it is text/plain or text/html.
        """
        # The trailing empty element yields the final CRLF after "--B--".
        raw = b"\r\n".join(
            [
                b"From: a@pec.it",
                b"To: b@pec.it",
                b'Content-Type: multipart/mixed; boundary="B"',
                b"",
                b"--B",
                b"Content-Type: text/plain",
                b"",
                b"Body",
                b"--B",
                b"Content-Type: application/octet-stream",
                b"Content-Disposition: attachment",
                b"",
                b"DATA",
                b"--B--",
                b"",
            ]
        )
        parsed = parse_eml(raw)
        # Any attachment that is reported must carry a non-empty filename.
        for attachment in parsed.attachments:
            assert not (attachment.filename is None or attachment.filename == "")

    def test_checksum_sha256_corretto(self):
        """The attachment checksum must equal the SHA-256 of its content."""
        import hashlib

        attachments = parse_eml(MULTIPART_EML).attachments
        assert len(attachments) == 1
        pdf = attachments[0]
        digest = hashlib.sha256(pdf.content).hexdigest()
        assert pdf.checksum_sha256 == digest
# ─── Test AttachmentInfo dataclass ────────────────────────────────────────────
class TestAttachmentInfoDataclass:
    """Checks for the fields and default flags of ``AttachmentInfo``."""

    def test_campi_base(self):
        import hashlib

        payload = b"test content"
        fields = {
            "filename": "test.pdf",
            "content_type": "application/pdf",
            "content": payload,
            "size_bytes": len(payload),
            "checksum_sha256": hashlib.sha256(payload).hexdigest(),
        }
        info = AttachmentInfo(**fields)
        assert info.filename == "test.pdf"
        assert info.content_type == "application/pdf"
        assert info.size_bytes == 12
        # Flags not passed to the constructor are expected to be False.
        assert info.is_inline is False
        assert info.is_pec_system is False

    def test_inline_flag(self):
        import hashlib

        payload = b"img"
        digest = hashlib.sha256(payload).hexdigest()
        info = AttachmentInfo(
            filename="img.png",
            content_type="image/png",
            content=payload,
            size_bytes=len(payload),
            checksum_sha256=digest,
            is_inline=True,
        )
        assert info.is_inline is True