Skip to content

Commit 6ec5645

Browse files
committed
bpo-37093: Allow http.client to parse non-ASCII header names
Previously, when http.client tried to parse a response from an out-of-spec server that sent a header with a non-ASCII name, email.feedparser would assume that the non-compliant header must be part of a message body and abort parsing. However, http.client had already determined the boundary between headers and body and only passed the headers to the parser. As a result, any headers after the first non-compliant one would be silently (!) ignored. This could include headers important for message framing like Content-Length and Transfer-Encoding. Historically, this parsing was handled by the rfc822 module, which didn't care about which bytes were in the header as long as there was a colon in the line. Now, add an optional argument to the email parsers to decide whether to require strict RFC-compliant header names. Default this to True to minimize the possibility of breaking other callers. In http.client, which already knows where the headers end and the body begins, use False. Note that the non-ASCII names will be decoded as ISO-8859-1 in keeping with how header values are decoded.
1 parent 60f3c39 commit 6ec5645

6 files changed

Lines changed: 115 additions & 37 deletions

File tree

Lib/email/feedparser.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -136,13 +136,16 @@ def __next__(self):
136136
class FeedParser:
137137
"""A feed-style parser of email."""
138138

139-
def __init__(self, _factory=None, *, policy=compat32):
139+
def __init__(self, _factory=None, *, policy=compat32, strictheaders=True):
140140
"""_factory is called with no arguments to create a new message obj
141141
142142
The policy keyword specifies a policy object that controls a number of
143143
aspects of the parser's operation. The default policy maintains
144144
backward compatibility.
145145
146+
The strictheaders keyword specifies whether to require RFC-compliant
147+
header names.
148+
146149
"""
147150
self.policy = policy
148151
self._old_style_factory = False
@@ -165,6 +168,7 @@ def __init__(self, _factory=None, *, policy=compat32):
165168
self._cur = None
166169
self._last = None
167170
self._headersonly = False
171+
self._strictheaders = strictheaders
168172

169173
# Non-public interface for supporting Parser's headersonly flag
170174
def _set_headersonly(self):
@@ -225,14 +229,15 @@ def _parsegen(self):
225229
if line is NeedMoreData:
226230
yield NeedMoreData
227231
continue
228-
if not headerRE.match(line):
232+
if NLCRE.match(line):
233+
break
234+
elif not headerRE.match(line) and self._strictheaders:
229235
# The RFC defined header/body separator (i.e. newline) was already
230236
# handled above, so this non-header line is part of the body:
231237
# record the missing-separator defect and push the line back.
232-
if not NLCRE.match(line):
233-
defect = errors.MissingHeaderBodySeparatorDefect()
234-
self.policy.handle_defect(self._cur, defect)
235-
self._input.unreadline(line)
238+
defect = errors.MissingHeaderBodySeparatorDefect()
239+
self.policy.handle_defect(self._cur, defect)
240+
self._input.unreadline(line)
236241
break
237242
headers.append(line)
238243
# Done with the headers, so parse them and figure out what we're

Lib/email/parser.py

Lines changed: 58 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -38,38 +38,53 @@ def __init__(self, _class=None, *, policy=compat32):
3838
self._class = _class
3939
self.policy = policy
4040

41-
def parse(self, fp, headersonly=False):
41+
def parse(self, fp, headersonly=False, strictheaders=True):
4242
"""Create a message structure from the data in a file.
4343
4444
Reads all the data from the file and returns the root of the message
45-
structure. Optional headersonly is a flag specifying whether to stop
46-
parsing after reading the headers or not. The default is False,
47-
meaning it parses the entire contents of the file.
45+
structure.
46+
47+
Optional headersonly is a flag specifying whether to stop parsing
48+
after reading the headers or not. The default is False, meaning it
49+
parses the entire contents of the file.
50+
51+
Optional strictheaders is a flag specifying whether to require
52+
RFC-compliant header names. The default is True, causing header
53+
parsing to stop at the first non-compliant header name.
4854
"""
49-
feedparser = FeedParser(self._class, policy=self.policy)
55+
feedparser = FeedParser(
56+
self._class, policy=self.policy, strictheaders=strictheaders)
5057
if headersonly:
5158
feedparser._set_headersonly()
5259
while data := fp.read(8192):
5360
feedparser.feed(data)
5461
return feedparser.close()
5562

56-
def parsestr(self, text, headersonly=False):
63+
def parsestr(self, text, headersonly=False, strictheaders=True):
5764
"""Create a message structure from a string.
5865
59-
Returns the root of the message structure. Optional headersonly is a
60-
flag specifying whether to stop parsing after reading the headers or
61-
not. The default is False, meaning it parses the entire contents of
62-
the file.
66+
Returns the root of the message structure.
67+
68+
Optional headersonly is a flag specifying whether to stop parsing
69+
after reading the headers or not. The default is False, meaning it
70+
parses the entire contents of the file.
71+
72+
Optional strictheaders is a flag specifying whether to require
73+
RFC-compliant header names. The default is True, causing header
74+
parsing to stop at the first non-compliant header name.
6375
"""
64-
return self.parse(StringIO(text), headersonly=headersonly)
76+
return self.parse(
77+
StringIO(text),
78+
headersonly=headersonly,
79+
strictheaders=strictheaders)
6580

6681

6782
class HeaderParser(Parser):
68-
def parse(self, fp, headersonly=True):
69-
return Parser.parse(self, fp, True)
83+
def parse(self, fp, headersonly=True, strictheaders=True):
84+
return Parser.parse(self, fp, True, strictheaders=strictheaders)
7085

71-
def parsestr(self, text, headersonly=True):
72-
return Parser.parsestr(self, text, True)
86+
def parsestr(self, text, headersonly=True, strictheaders=True):
87+
return Parser.parsestr(self, text, True, strictheaders=strictheaders)
7388

7489

7590
class BytesParser:
@@ -92,36 +107,49 @@ def __init__(self, *args, **kw):
92107
"""
93108
self.parser = Parser(*args, **kw)
94109

95-
def parse(self, fp, headersonly=False):
110+
def parse(self, fp, headersonly=False, strictheaders=True):
96111
"""Create a message structure from the data in a binary file.
97112
98113
Reads all the data from the file and returns the root of the message
99-
structure. Optional headersonly is a flag specifying whether to stop
100-
parsing after reading the headers or not. The default is False,
101-
meaning it parses the entire contents of the file.
114+
structure.
115+
116+
Optional headersonly is a flag specifying whether to stop parsing
117+
after reading the headers or not. The default is False, meaning it
118+
parses the entire contents of the file.
119+
120+
Optional strictheaders is a flag specifying whether to require
121+
RFC-compliant header names. The default is True, causing header
122+
parsing to stop at the first non-compliant header name.
102123
"""
103124
fp = TextIOWrapper(fp, encoding='ascii', errors='surrogateescape')
104125
try:
105-
return self.parser.parse(fp, headersonly)
126+
return self.parser.parse(fp, headersonly, strictheaders)
106127
finally:
107128
fp.detach()
108129

109130

110-
def parsebytes(self, text, headersonly=False):
131+
def parsebytes(self, text, headersonly=False, strictheaders=True):
111132
"""Create a message structure from a byte string.
112133
113-
Returns the root of the message structure. Optional headersonly is a
114-
flag specifying whether to stop parsing after reading the headers or
115-
not. The default is False, meaning it parses the entire contents of
116-
the file.
134+
Returns the root of the message structure.
135+
136+
Optional headersonly is a flag specifying whether to stop parsing
137+
after reading the headers or not. The default is False, meaning it
138+
parses the entire contents of the file.
139+
140+
Optional strictheaders is a flag specifying whether to require
141+
RFC-compliant header names. The default is True, causing header
142+
parsing to stop at the first non-compliant header name.
117143
"""
118144
text = text.decode('ASCII', errors='surrogateescape')
119-
return self.parser.parsestr(text, headersonly)
145+
return self.parser.parsestr(text, headersonly, strictheaders)
120146

121147

122148
class BytesHeaderParser(BytesParser):
123-
def parse(self, fp, headersonly=True):
124-
return BytesParser.parse(self, fp, headersonly=True)
149+
def parse(self, fp, headersonly=True, strictheaders=True):
150+
return BytesParser.parse(
151+
self, fp, headersonly=True, strictheaders=strictheaders)
125152

126-
def parsebytes(self, text, headersonly=True):
127-
return BytesParser.parsebytes(self, text, headersonly=True)
153+
def parsebytes(self, text, headersonly=True, strictheaders=True):
154+
return BytesParser.parsebytes(
155+
self, text, headersonly=True, strictheaders=strictheaders)

Lib/http/client.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,8 @@ def _parse_header_lines(header_lines, _class=HTTPMessage):
246246
247247
"""
248248
hstring = b''.join(header_lines).decode('iso-8859-1')
249-
return email.parser.Parser(_class=_class).parsestr(hstring)
249+
return email.parser.Parser(_class=_class).parsestr(
250+
hstring, headersonly=True, strictheaders=False)
250251

251252
def parse_headers(fp, _class=HTTPMessage, *, _max_headers=None):
252253
"""Parses only RFC 5322 headers from a file pointer."""

Lib/test/test_httplib.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,48 @@ def test_parse_all_octets(self):
339339
self.assertIn(' folded with space', folded)
340340
self.assertEndsWith(folded, 'folded with tab')
341341

342+
def test_parse_invalid_octets(self):
343+
# Ensure a header with an out-of-spec (non-ASCII) name neither breaks the parser nor hides the headers that follow it
344+
body = (
345+
b'HTTP/1.1 200 OK\r\n'
346+
b'A-Canonical: header\r\n'
347+
b'sOMe-CRazY: heADers\r\n'
348+
b'With-UTF-8-Values: \xe2\x9c\x94\r\n'
349+
b'And-Even-\xf0\x9f\x8c\xb4: names\r\n'
350+
b'and-more: after that\r\n'
351+
b'Transfer-Encoding: chunked\r\n'
352+
b'\r\n' # End of headers
353+
b'e\r\n'
354+
b'Hello, world!\n\r\n'
355+
b'0\r\n'
356+
b'\r\n'
357+
)
358+
sock = FakeSocket(body)
359+
resp = client.HTTPResponse(sock)
360+
resp.begin()
361+
# Even if there was a charset specified in the Content-Type, that
362+
# would only apply to the actual body. Interpret the out-of-spec
363+
# response headers as ISO-8859-1, which is consistent with the
364+
# encoding used for header values and how one would generate such
365+
# a response from a WSGI server.
366+
expected = {
367+
'A-Canonical': 'header',
368+
'sOMe-CRazY': 'heADers',
369+
'With-UTF-8-Values': '\xe2\x9c\x94',
370+
'And-Even-\xf0\x9f\x8c\xb4': 'names',
371+
'and-more': 'after that',
372+
'Transfer-Encoding': 'chunked',
373+
}
374+
case_sensitive = dict(resp.getheaders())
375+
for header, value in expected.items():
376+
with self.subTest((header, value)):
377+
self.assertEqual(resp.getheader(header), value)
378+
self.assertEqual(resp.msg[header], value)
379+
self.assertIn(header, case_sensitive)
380+
self.assertNotIn(header.upper(), case_sensitive)
381+
self.assertEqual(resp.read(), b'Hello, world!\n')
382+
self.assertEqual(resp.headers.get_payload(), '')
383+
342384
def test_invalid_headers(self):
343385
conn = client.HTTPConnection('example.com')
344386
conn.sock = FakeSocket('')

Misc/ACKS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,7 @@ Artem Bulgakov
268268
Dick Bulterman
269269
Bill Bumgarner
270270
Jimmy Burgett
271+
Tim Burke
271272
Charles Burkland
272273
Edmond Burnett
273274
Tommy Burnette
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
:mod:`http.client` now parses non-ASCII header names.

0 commit comments

Comments
 (0)