Skip to content

Commit 442dd62

Browse files
committed
bpo-37093: Allow http.client to parse non-ASCII header names
Previously, when http.client tried to parse a response from an out-of-spec server that sent a header with a non-ASCII name, email.feedparser would assume that the non-compliant header must be part of the message body and stop parsing headers. However, http.client has already determined the boundary between headers and body by that point, and passes only the headers to the parser. As a result, any headers after the first non-compliant one were silently (!) ignored. This could include headers that are important for message framing, such as Content-Length and Transfer-Encoding. Historically, this parsing was handled by the rfc822 module, which didn't care which bytes were in the header name as long as there was a colon in the line. Now, add an optional argument (strictheaders) to the email parsers to decide whether to require strictly RFC-compliant header names. Default it to True to minimize the possibility of breaking other callers. In http.client, which already knows where the headers end and the body begins, pass False. Note that non-ASCII names are decoded as ISO-8859-1, in keeping with how header values are decoded.
1 parent bf95ff9 commit 442dd62

6 files changed

Lines changed: 76 additions & 23 deletions

File tree

Lib/email/feedparser.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ def __next__(self):
136136
class FeedParser:
137137
"""A feed-style parser of email."""
138138

139-
def __init__(self, _factory=None, *, policy=compat32):
139+
def __init__(self, _factory=None, *, policy=compat32, strictheaders=True):
140140
"""_factory is called with no arguments to create a new message obj
141141
142142
The policy keyword specifies a policy object that controls a number of
@@ -165,6 +165,7 @@ def __init__(self, _factory=None, *, policy=compat32):
165165
self._cur = None
166166
self._last = None
167167
self._headersonly = False
168+
self._strictheaders = strictheaders
168169

169170
# Non-public interface for supporting Parser's headersonly flag
170171
def _set_headersonly(self):
@@ -225,14 +226,15 @@ def _parsegen(self):
225226
if line is NeedMoreData:
226227
yield NeedMoreData
227228
continue
228-
if not headerRE.match(line):
229+
if NLCRE.match(line):
230+
break
231+
elif not headerRE.match(line) and self._strictheaders:
229232
# If we saw the RFC defined header/body separator
230233
# (i.e. newline), just throw it away. Otherwise the line is
231234
# part of the body so push it back.
232-
if not NLCRE.match(line):
233-
defect = errors.MissingHeaderBodySeparatorDefect()
234-
self.policy.handle_defect(self._cur, defect)
235-
self._input.unreadline(line)
235+
defect = errors.MissingHeaderBodySeparatorDefect()
236+
self.policy.handle_defect(self._cur, defect)
237+
self._input.unreadline(line)
236238
break
237239
headers.append(line)
238240
# Done with the headers, so parse them and figure out what we're

Lib/email/parser.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -38,15 +38,16 @@ def __init__(self, _class=None, *, policy=compat32):
3838
self._class = _class
3939
self.policy = policy
4040

41-
def parse(self, fp, headersonly=False):
41+
def parse(self, fp, headersonly=False, strictheaders=True):
4242
"""Create a message structure from the data in a file.
4343
4444
Reads all the data from the file and returns the root of the message
4545
structure. Optional headersonly is a flag specifying whether to stop
4646
parsing after reading the headers or not. The default is False,
4747
meaning it parses the entire contents of the file.
4848
"""
49-
feedparser = FeedParser(self._class, policy=self.policy)
49+
feedparser = FeedParser(
50+
self._class, policy=self.policy, strictheaders=strictheaders)
5051
if headersonly:
5152
feedparser._set_headersonly()
5253
while True:
@@ -56,24 +57,27 @@ def parse(self, fp, headersonly=False):
5657
feedparser.feed(data)
5758
return feedparser.close()
5859

59-
def parsestr(self, text, headersonly=False):
60+
def parsestr(self, text, headersonly=False, strictheaders=True):
6061
"""Create a message structure from a string.
6162
6263
Returns the root of the message structure. Optional headersonly is a
6364
flag specifying whether to stop parsing after reading the headers or
6465
not. The default is False, meaning it parses the entire contents of
6566
the file.
6667
"""
67-
return self.parse(StringIO(text), headersonly=headersonly)
68+
return self.parse(
69+
StringIO(text),
70+
headersonly=headersonly,
71+
strictheaders=strictheaders)
6872

6973

7074

7175
class HeaderParser(Parser):
72-
def parse(self, fp, headersonly=True):
73-
return Parser.parse(self, fp, True)
76+
def parse(self, fp, headersonly=True, strictheaders=True):
77+
return Parser.parse(self, fp, True, strictheaders=strictheaders)
7478

75-
def parsestr(self, text, headersonly=True):
76-
return Parser.parsestr(self, text, True)
79+
def parsestr(self, text, headersonly=True, strictheaders=True):
80+
return Parser.parsestr(self, text, True, strictheaders=strictheaders)
7781

7882

7983
class BytesParser:
@@ -96,7 +100,7 @@ def __init__(self, *args, **kw):
96100
"""
97101
self.parser = Parser(*args, **kw)
98102

99-
def parse(self, fp, headersonly=False):
103+
def parse(self, fp, headersonly=False, strictheaders=True):
100104
"""Create a message structure from the data in a binary file.
101105
102106
Reads all the data from the file and returns the root of the message
@@ -106,12 +110,12 @@ def parse(self, fp, headersonly=False):
106110
"""
107111
fp = TextIOWrapper(fp, encoding='ascii', errors='surrogateescape')
108112
try:
109-
return self.parser.parse(fp, headersonly)
113+
return self.parser.parse(fp, headersonly, strictheaders)
110114
finally:
111115
fp.detach()
112116

113117

114-
def parsebytes(self, text, headersonly=False):
118+
def parsebytes(self, text, headersonly=False, strictheaders=True):
115119
"""Create a message structure from a byte string.
116120
117121
Returns the root of the message structure. Optional headersonly is a
@@ -120,12 +124,14 @@ def parsebytes(self, text, headersonly=False):
120124
the file.
121125
"""
122126
text = text.decode('ASCII', errors='surrogateescape')
123-
return self.parser.parsestr(text, headersonly)
127+
return self.parser.parsestr(text, headersonly, strictheaders)
124128

125129

126130
class BytesHeaderParser(BytesParser):
127-
def parse(self, fp, headersonly=True):
128-
return BytesParser.parse(self, fp, headersonly=True)
131+
def parse(self, fp, headersonly=True, strictheaders=True):
132+
return BytesParser.parse(
133+
self, fp, headersonly=True, strictheaders=strictheaders)
129134

130-
def parsebytes(self, text, headersonly=True):
131-
return BytesParser.parsebytes(self, text, headersonly=True)
135+
def parsebytes(self, text, headersonly=True, strictheaders=True):
136+
return BytesParser.parsebytes(
137+
self, text, headersonly=True, strictheaders=strictheaders)

Lib/http/client.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,8 @@ def parse_headers(fp, _class=HTTPMessage):
233233
"""
234234
headers = _read_headers(fp)
235235
hstring = b''.join(headers).decode('iso-8859-1')
236-
return email.parser.Parser(_class=_class).parsestr(hstring)
236+
return email.parser.Parser(_class=_class).parsestr(
237+
hstring, headersonly=True, strictheaders=False)
237238

238239

239240
class HTTPResponse(io.BufferedIOBase):

Lib/test/test_httplib.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,48 @@ def test_parse_all_octets(self):
322322
self.assertIn(' folded with space', folded)
323323
self.assertTrue(folded.endswith('folded with tab'))
324324

325+
def test_parse_invalid_octets(self):
326+
# Ensure out-of-spec non-ASCII header names do not break the parser
327+
body = (
328+
b'HTTP/1.1 200 OK\r\n'
329+
b'A-Canonical: header\r\n'
330+
b'sOMe-CRazY: heADers\r\n'
331+
b'With-UTF-8-Values: \xe2\x9c\x94\r\n'
332+
b'And-Even-\xf0\x9f\x8c\xb4: names\r\n'
333+
b'and-more: after that\r\n'
334+
b'Transfer-Encoding: chunked\r\n'
335+
b'\r\n' # End of headers
336+
b'e\r\n'
337+
b'Hello, world!\n\r\n'
338+
b'0\r\n'
339+
b'\r\n'
340+
)
341+
sock = FakeSocket(body)
342+
resp = client.HTTPResponse(sock)
343+
resp.begin()
344+
# Even if there was a charset specified in the Content-Type, that
345+
# would only apply to the actual body. Interpret the out-of-spec
346+
# response headers as ISO-8859-1, which is consistent with the
347+
# encoding used for header values and how one would generate such
348+
# a response from a WSGI server.
349+
expected = {
350+
'A-Canonical': 'header',
351+
'sOMe-CRazY': 'heADers',
352+
'With-UTF-8-Values': '\xe2\x9c\x94',
353+
'And-Even-\xf0\x9f\x8c\xb4': 'names',
354+
'and-more': 'after that',
355+
'Transfer-Encoding': 'chunked',
356+
}
357+
case_sensitive = dict(resp.getheaders())
358+
for header, value in expected.items():
359+
with self.subTest((header, value)):
360+
self.assertEqual(resp.getheader(header), value)
361+
self.assertEqual(resp.msg[header], value)
362+
self.assertIn(header, case_sensitive)
363+
self.assertNotIn(header.upper(), case_sensitive)
364+
self.assertEqual(resp.read(), b'Hello, world!\n')
365+
self.assertEqual(resp.headers.get_payload(), '')
366+
325367
def test_invalid_headers(self):
326368
conn = client.HTTPConnection('example.com')
327369
conn.sock = FakeSocket('')

Misc/ACKS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,7 @@ Artem Bulgakov
255255
Dick Bulterman
256256
Bill Bumgarner
257257
Jimmy Burgett
258+
Tim Burke
258259
Charles Burkland
259260
Edmond Burnett
260261
Tommy Burnette
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
:mod:`http.client` now parses non-ASCII header names.

0 commit comments

Comments
 (0)