""" Test unitari per app.parsers.eml_parser. Copertura: - Parsing header (Subject, From, To, Cc, Message-ID, Date) - Decodifica RFC 2047 (UTF-8, ISO-8859-1, base64, quoted-printable) - Estrazione body text/plain e text/html - Estrazione allegati (singoli e multipli) - Gestione EML-in-EML (message/rfc822) - Flag has_attachments (solo allegati non-PEC-system) - Allegati PEC di sistema (daticert.xml, postacert.eml) - EML vuoto / malformato (no crash) - Messaggio non-multipart """ import email import textwrap import pytest from app.parsers.eml_parser import ( AttachmentInfo, ParsedEmail, decode_header, extract_addresses, parse_date, parse_eml, ) # ─── Fixture EML ────────────────────────────────────────────────────────────── SIMPLE_EML = b"""\ From: mittente@pec.it To: destinatario@pec.it Cc: copia@pec.it Subject: Test PEC Fase 3 Message-ID: Date: Wed, 18 Mar 2026 14:00:00 +0100 Content-Type: text/plain; charset=utf-8 Corpo del messaggio di test. Seconda riga. """ MULTIPART_EML = b"""\ From: mittente@pec.it To: dest@pec.it Subject: PEC con allegato Date: Wed, 18 Mar 2026 10:00:00 +0100 Content-Type: multipart/mixed; boundary="====boundary123====" --====boundary123==== Content-Type: text/plain; charset=utf-8 Testo del messaggio. --====boundary123==== Content-Type: application/pdf; name="documento.pdf" Content-Disposition: attachment; filename="documento.pdf" Content-Transfer-Encoding: base64 JVBERi0xLjQ= --====boundary123====-- """ MULTIPART_HTML_EML = b"""\ From: mittente@pec.it To: dest@pec.it Subject: PEC multipart/alternative Date: Wed, 18 Mar 2026 10:00:00 +0100 Content-Type: multipart/alternative; boundary="====alt====" --====alt==== Content-Type: text/plain; charset=utf-8 Testo piano. --====alt==== Content-Type: text/html; charset=utf-8

Testo HTML.

--====alt====-- """ RECEIPT_EML_WITH_NESTED = b"""\ From: posta-certificata@pec.aruba.it To: mittente@pec.it Subject: CONSEGNA: Test PEC Fase 3 X-Ricevuta: avvenuta-consegna X-Riferimento-Message-ID: Date: Wed, 18 Mar 2026 14:05:00 +0100 Content-Type: multipart/mixed; boundary="====receipt====" --====receipt==== Content-Type: text/plain; charset=utf-8 Il messaggio e' stato consegnato al destinatario. --====receipt==== Content-Type: application/xml; name="daticert.xml" Content-Disposition: attachment; filename="daticert.xml" --====receipt==== Content-Type: message/rfc822 Content-Disposition: inline From: mittente@pec.it To: destinatario@pec.it Subject: Test PEC Fase 3 Message-ID: Date: Wed, 18 Mar 2026 14:00:00 +0100 Content-Type: text/plain; charset=utf-8 Corpo del messaggio originale. --====receipt====-- """ MULTIPLE_ATTACHMENTS_EML = b"""\ From: a@pec.it To: b@pec.it Subject: PEC con allegati multipli Date: Wed, 18 Mar 2026 10:00:00 +0100 Content-Type: multipart/mixed; boundary="====multi====" --====multi==== Content-Type: text/plain; charset=utf-8 Corpo. --====multi==== Content-Type: application/pdf; name="doc1.pdf" Content-Disposition: attachment; filename="doc1.pdf" Content-Transfer-Encoding: base64 AAEC --====multi==== Content-Type: application/pdf; name="doc2.pdf" Content-Disposition: attachment; filename="doc2.pdf" Content-Transfer-Encoding: base64 BAEC --====multi====-- """ PEC_SYSTEM_EML = b"""\ From: posta-certificata@pec.aruba.it To: mittente@pec.it Subject: ACCETTAZIONE: Test X-Ricevuta: accettazione Date: Wed, 18 Mar 2026 14:01:00 +0100 Content-Type: multipart/mixed; boundary="====sys====" --====sys==== Content-Type: text/plain; charset=utf-8 Ricevuta di accettazione. --====sys==== Content-Type: application/xml; name="daticert.xml" Content-Disposition: attachment; filename="daticert.xml" --====sys==== Content-Type: message/rfc822 Content-Disposition: inline; filename="postacert.eml" From: mittente@pec.it To: dest@pec.it Subject: Test originale Corpo. --====sys====-- """ # ─── Test decode_header ─────────────────────────────────────────────────────── class TestDecodeHeader: def test_stringa_semplice(self): assert decode_header("Hello World") == "Hello World" def test_none_ritorna_none(self): assert decode_header(None) is None def test_stringa_vuota_ritorna_none(self): assert decode_header("") is None def test_utf8_base64(self): # "PEC test" in base64 UTF-8 encoded = "=?utf-8?b?UEVDIHRlc3Q=?=" assert decode_header(encoded) == "PEC test" def test_iso8859_quoted_printable(self): # "Multa n. 123" in QP ISO-8859-1 encoded = "=?iso-8859-1?q?Multa_n=2E_123?=" result = decode_header(encoded) assert result is not None assert "Multa" in result assert "123" in result def test_multipart_header(self): # Header con più parti encodate encoded = "=?utf-8?b?UEVD?= =?utf-8?b?IHRlc3Q=?=" result = decode_header(encoded) assert result is not None assert "PEC" in result def test_stringa_gia_decodificata(self): assert decode_header("Oggetto normale") == "Oggetto normale" # ─── Test extract_addresses ─────────────────────────────────────────────────── class TestExtractAddresses: def test_singolo_indirizzo(self): addrs = extract_addresses("test@example.com") assert "test@example.com" in addrs def test_multipli_indirizzi(self): addrs = extract_addresses("a@x.com, b@y.com, c@z.com") assert len(addrs) == 3 assert "a@x.com" in addrs assert "b@y.com" in addrs assert "c@z.com" in addrs def test_display_name(self): addrs = extract_addresses('"Mario Rossi" ') assert "mario@comune.it" in addrs def test_none_ritorna_lista_vuota(self): assert extract_addresses(None) == [] def test_stringa_vuota_ritorna_lista_vuota(self): assert extract_addresses("") == [] # ─── Test parse_date ────────────────────────────────────────────────────────── class TestParseDate: def test_data_valida(self): d = parse_date("Wed, 18 Mar 2026 14:00:00 +0100") assert d is not None assert d.year == 2026 assert d.month == 3 assert d.day == 18 def test_none_ritorna_none(self): assert parse_date(None) is None def test_stringa_invalida_ritorna_none(self): assert parse_date("non-una-data") is None def test_data_senza_timezone_aggiunge_utc(self): d = parse_date("18 Mar 2026 14:00:00 +0000") assert d is not None assert d.tzinfo is not None # ─── Test parse_eml – messaggio semplice ────────────────────────────────────── class TestParseEmlSimple: def test_subject(self): p = parse_eml(SIMPLE_EML) assert p.subject == "Test PEC Fase 3" def test_from_address(self): p = parse_eml(SIMPLE_EML) assert p.from_address == "mittente@pec.it" def test_to_addresses(self): p = parse_eml(SIMPLE_EML) assert "destinatario@pec.it" in p.to_addresses def test_cc_addresses(self): p = parse_eml(SIMPLE_EML) assert "copia@pec.it" in p.cc_addresses def test_message_id(self): p = parse_eml(SIMPLE_EML) assert p.message_id == "" def test_date(self): p = parse_eml(SIMPLE_EML) assert p.date is not None assert p.date.year == 2026 def test_body_text(self): p = parse_eml(SIMPLE_EML) assert p.body_text is not None assert "Corpo del messaggio" in p.body_text assert "Seconda riga" in p.body_text def test_no_html(self): p = parse_eml(SIMPLE_EML) assert p.body_html is None def test_no_attachments(self): p = parse_eml(SIMPLE_EML) assert p.attachments == [] assert p.has_attachments is False def test_raw_message_presente(self): p = parse_eml(SIMPLE_EML) assert p.raw_message is not None assert isinstance(p.raw_message, email.message.Message) # ─── Test parse_eml – multipart con allegato ────────────────────────────────── class TestParseEmlMultipart: def test_body_text_estratto(self): p = parse_eml(MULTIPART_EML) assert p.body_text is not None assert "Testo del messaggio" in p.body_text def test_allegato_trovato(self): p = parse_eml(MULTIPART_EML) assert len(p.attachments) == 1 att = p.attachments[0] assert att.filename == "documento.pdf" assert att.content_type == "application/pdf" assert att.size_bytes > 0 assert att.checksum_sha256 is not None assert len(att.checksum_sha256) == 64 def test_has_attachments_true(self): p = parse_eml(MULTIPART_EML) assert p.has_attachments is True def test_allegati_multipli(self): p = parse_eml(MULTIPLE_ATTACHMENTS_EML) filenames = [a.filename for a in p.attachments] assert "doc1.pdf" in filenames assert "doc2.pdf" in filenames assert len(p.attachments) == 2 # ─── Test parse_eml – multipart/alternative ─────────────────────────────────── class TestParseEmlAlternative: def test_body_text_e_html(self): p = parse_eml(MULTIPART_HTML_EML) assert p.body_text is not None assert "Testo piano" in p.body_text assert p.body_html is not None assert "" in p.body_html assert "Testo HTML" in p.body_html def test_no_attachments_in_alternative(self): p = parse_eml(MULTIPART_HTML_EML) assert p.has_attachments is False # ─── Test parse_eml – ricevuta con EML-in-EML ──────────────────────────────── class TestParseEmlReceiptWithNested: def test_body_text_ricevuta(self): p = parse_eml(RECEIPT_EML_WITH_NESTED) assert p.body_text is not None assert "consegnato" in p.body_text.lower() def test_allegato_xml_daticert(self): p = parse_eml(RECEIPT_EML_WITH_NESTED) filenames = [a.filename for a in p.attachments] assert "daticert.xml" in filenames def test_allegato_xml_e_pec_system(self): p = parse_eml(RECEIPT_EML_WITH_NESTED) xml_att = next(a for a in p.attachments if a.filename == "daticert.xml") assert xml_att.is_pec_system is True def test_eml_annidato_trovato(self): """Il messaggio originale annidato deve essere presente come allegato.""" p = parse_eml(RECEIPT_EML_WITH_NESTED) eml_atts = [a for a in p.attachments if a.content_type == "message/rfc822"] assert len(eml_atts) >= 1 def test_has_attachments_false_quando_solo_system(self): """has_attachments deve essere False se ci sono solo allegati PEC di sistema.""" p = parse_eml(PEC_SYSTEM_EML) # daticert.xml e postacert.eml sono entrambi system → has_attachments = False assert p.has_attachments is False def test_allegati_sistema_marcati_correttamente(self): p = parse_eml(PEC_SYSTEM_EML) for att in p.attachments: if att.filename in ("daticert.xml", "postacert.eml"): assert att.is_pec_system is True, f"{att.filename} dovrebbe essere is_pec_system=True" # ─── Test parse_eml – edge cases ───────────────────────────────────────────── class TestParseEmlEdgeCases: def test_eml_vuoto_no_eccezione(self): p = parse_eml(b"") assert isinstance(p, ParsedEmail) assert p.subject is None assert p.body_text is None assert p.attachments == [] def test_eml_malformato_no_eccezione(self): p = parse_eml(b"questo non e' un EML valido\x00\xFF") assert isinstance(p, ParsedEmail) def test_headers_mancanti(self): raw = b"Content-Type: text/plain\r\n\r\nSolo corpo." p = parse_eml(raw) assert p.subject is None assert p.from_address is None assert p.to_addresses == [] def test_body_con_encoding_windows1252(self): raw = ( b"From: a@pec.it\r\nTo: b@pec.it\r\n" b"Content-Type: text/plain; charset=windows-1252\r\n\r\n" b"Buonagiornata\xe0 tutti" ) p = parse_eml(raw) assert p.body_text is not None assert "Buonagiornata" in p.body_text def test_attachments_senza_filename_ignorati(self): """ Un part senza filename non deve essere aggiunto come allegato se non è text/plain o text/html. """ raw = ( b"From: a@pec.it\r\nTo: b@pec.it\r\n" b'Content-Type: multipart/mixed; boundary="B"\r\n\r\n' b"--B\r\nContent-Type: text/plain\r\n\r\nBody\r\n" b"--B\r\nContent-Type: application/octet-stream\r\n" b"Content-Disposition: attachment\r\n\r\nDATA\r\n" b"--B--\r\n" ) p = parse_eml(raw) # L'allegato senza filename non deve comparire for att in p.attachments: assert att.filename is not None and att.filename != "" def test_checksum_sha256_corretto(self): """Il checksum SHA-256 dell'allegato deve essere valido.""" import hashlib p = parse_eml(MULTIPART_EML) assert len(p.attachments) == 1 att = p.attachments[0] expected = hashlib.sha256(att.content).hexdigest() assert att.checksum_sha256 == expected # ─── Test AttachmentInfo dataclass ──────────────────────────────────────────── class TestAttachmentInfoDataclass: def test_campi_base(self): import hashlib content = b"test content" att = AttachmentInfo( filename="test.pdf", content_type="application/pdf", content=content, size_bytes=len(content), checksum_sha256=hashlib.sha256(content).hexdigest(), ) assert att.filename == "test.pdf" assert att.content_type == "application/pdf" assert att.size_bytes == 12 assert att.is_inline is False assert att.is_pec_system is False def test_inline_flag(self): import hashlib content = b"img" att = AttachmentInfo( filename="img.png", content_type="image/png", content=content, size_bytes=len(content), checksum_sha256=hashlib.sha256(content).hexdigest(), is_inline=True, ) assert att.is_inline is True