sync-engine
sync-engine copied to clipboard
Failed to correctly parse ISO-8859-15 encoded FROM header
Hi,
I just got an email with the following headers:
...
Subject: =?ISO-8859-15?Q?Notificaci=F3n=20de=20conexi=F3n=20a=20la=20cuenta=20XXXXXXX?=
X-Ovh-Template: nic/es/loginNotification.model
X-Ovh-Nic: XXXXXXX
Mime-Version: 1.0
Content-Type: text/plain; charset=ISO-8859-15
To: [email protected], [email protected]
Content-Transfer-Encoding: 8bit
From: =?ISO-8859-15?Q?Atenci=F3n=20al=20cliente=20OVH=20 <[email protected]>,
[email protected]
Message-Id: <[email protected]>
...
Both Subject and From seem to be ISO-8859-15 encoded, but while Subject gets parsed correctly, From doesn't:
"subject": "Notificación de conexión a la cuenta XXXXXXX"
"from": [
{
"email": "[email protected]",
"name": ""
},
{
"email": "[email protected]",
"name": "=?ISO-8859-15?Q?Atenci=F3n=20al=20cliente=20OVH=20"
}
],
I don't know if the message is malformed (that linebreak is suspicious) but Gmail displays it as follows:

Thanks for the report! That header definitely doesn't look conformant -- RFC2047 says that you can't put space characters inside an encoded-word (i.e., between =? and ?=). But the resultant output is a bit bizarre. We should be able to produce a less strange address list given this type of input.
@emfree where does RFC2047 state this? This is a bug in Python 2.x: http://bugs.python.org/issue1079
We actually backported Python 3.x's decode_header to fix this issue. Let me know if interested in a PR.
@thomasst can you drop a PR for that? :smile:
We use our own email parsing so I haven't actually integrated it with Nylas. Here's the decode_header method you can use though. Simply use that instead of the one built-in to Python. Let me know if you need anything else.
# Below is decode_header backported from latest CPython
# https://github.com/python/cpython/blob/master/Lib/email/header.py
# rev 994d20c1a28bd1b83a0d3034bdc3872667955f5d
# because Python 2.x's decode_header has issues: http://bugs.python.org/issue1079
# Slight changes had to be made since Python 3 has different string types.
import binascii
from email import charset as _charset
from email.errors import HeaderParseError
import email.base64mime
import email.quoprimime
import re
# Separator inserted between two adjacent unencoded words.  It is a bytes
# literal because decode_header() joins byte strings; on Python 2 b' ' and
# ' ' are the same value, on Python 3 a text ' ' could not be concatenated
# with the bytes words.
BSPACE = b' '
Charset = _charset.Charset

# The text (non-bytes) string type: ``unicode`` on Python 2, ``str`` on
# Python 3.  Spelled this way so the module imports on both.
_text_type = type(u'')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
# The payload group is a non-greedy ``.*?``, so whitespace inside an
# encoded-word is accepted rather than terminating the match.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)


def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (string, charset) pairs containing each of the decoded
    parts of the header.  Charset is None for non-encoded parts of the
    header, otherwise a lower-case string containing the name of the
    character set specified in the encoded string.

    header may be a string that may or may not contain RFC2047 encoded
    words, or it may be a Header object.  Anything else is coerced with
    str().

    When the header contains no encoded-word at all, the single pair
    ``(str(header), None)`` is returned unchanged.  Otherwise every returned
    string is a byte string holding the raw decoded bytes; turning those
    bytes into text using the reported charset is left to the caller.

    An email.errors.HeaderParseError may be raised when certain decoding
    errors occur (e.g. a base64 decoding exception).
    """
    # A Header object has to be recognised *before* the str() coercion
    # below, otherwise this branch can never run.  The fast path needs
    # email.charset._encode; where that helper is missing (presumably
    # Python 2 -- confirm) we fall through and parse str(header) instead.
    if hasattr(header, '_chunks') and hasattr(_charset, '_encode'):
        return [(_charset._encode(string, str(charset)), str(charset))
                for string, charset in header._chunks]

    header = str(header)

    # If no encoding, just return the header with no charset.
    if not ecre.search(header):
        return [(header, None)]

    # First step is to parse all the encoded parts into triplets of the form
    # (encoded_string, encoding, charset).  For unencoded strings, the last
    # two parts will be None.
    words = []
    for line in header.splitlines():
        # With three capture groups, split() yields
        #   [text, charset, encoding, payload, text, charset, ...]
        # i.e. a text item followed by zero or more groups of four.
        parts = ecre.split(line)
        for i in range(0, len(parts), 4):
            unencoded = parts[i]
            if i == 0:
                # Only the leading text of a line is left-stripped
                # (continuation lines start with folding whitespace).
                unencoded = unencoded.lstrip()
            if unencoded:
                words.append((unencoded, None, None))
            if i + 3 < len(parts):
                charset, encoding, encoded = parts[i + 1:i + 4]
                words.append((encoded, encoding.lower(), charset.lower()))

    # Now loop over words and remove words that consist of whitespace
    # between two encoded strings.
    droplist = []
    for n, w in enumerate(words):
        if n > 1 and w[1] and words[n - 2][1] and words[n - 1][0].isspace():
            droplist.append(n - 1)
    # Delete from the end so earlier indexes stay valid.
    for d in reversed(droplist):
        del words[d]

    # The next step is to decode each encoded word by applying the reverse
    # base64 or quopri transformation.  decoded_words is now a list of the
    # form (decoded_word, charset).
    decoded_words = []
    for encoded_string, encoding, charset in words:
        if encoding is None:
            # This is an unencoded word.
            decoded_words.append((encoded_string, charset))
        elif encoding == 'q':
            word = email.quoprimime.header_decode(encoded_string)
            decoded_words.append((word, charset))
        elif encoding == 'b':
            # Postel's law: add missing padding
            paderr = len(encoded_string) % 4
            if paderr:
                encoded_string += '==='[:4 - paderr]
            try:
                word = email.base64mime.decode(encoded_string)
            except binascii.Error:
                raise HeaderParseError('Base64 decoding error')
            else:
                decoded_words.append((word, charset))
        else:
            # ecre only admits "q" or "b", so this is a programming error.
            raise AssertionError('Unexpected encoding: ' + encoding)

    # Now convert all words to bytes and collapse consecutive runs of
    # similarly encoded words.
    collapsed = []
    last_word = last_charset = None
    for word, charset in decoded_words:
        if isinstance(word, _text_type):
            # Text words here carry one code point per decoded byte
            # (quoted-printable output) or plain header text.
            # raw-unicode-escape maps code points below 256 straight back
            # to the same byte value, which a UTF-8 encode would not.
            word = word.encode('raw-unicode-escape')
        if last_word is None:
            last_word = word
            last_charset = charset
        elif charset != last_charset:
            collapsed.append((last_word, last_charset))
            last_word = word
            last_charset = charset
        elif last_charset is None:
            # Two unencoded words in a row (e.g. from folded lines) are
            # joined with a single space.
            last_word += BSPACE + word
        else:
            # Adjacent encoded words in the same charset are concatenated
            # with nothing in between.
            last_word += word
    collapsed.append((last_word, last_charset))
    return collapsed