Skip to content

Commit 568f6db

Browse files
committed
bpo-37093: Allow http.client to parse non-ASCII header names
Previously, when http.client tried to parse a response from an out-of-spec server that sent a header with a non-ASCII name, email.feedparser would assume that the non-compliant header must be part of a message body and abort parsing. However, http.client has already determined the boundary between headers and body and only passes the headers to the parser. As a result, any headers after the first non-compliant one would be silently (!) ignored. This could include headers important for message framing like Content-Length and Transfer-Encoding. Long ago, this parsing was handled by the rfc822 module, which didn't care about which bytes were in the header name as long as there was a colon in the line. Now, add an optional argument to the email parsers to decide whether to require strict RFC-compliant header names. Default this to True to minimize the possibility of breaking other callers. In http.client, which already knows where the headers end and the body begins, use False. Note that the non-ASCII names will be decoded as ISO-8859-1, in keeping with how header values are decoded.
1 parent 0faa0ba commit 568f6db

6 files changed

Lines changed: 76 additions & 23 deletions

File tree

Lib/email/feedparser.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ def __next__(self):
136136
class FeedParser:
137137
"""A feed-style parser of email."""
138138

139-
def __init__(self, _factory=None, *, policy=compat32):
139+
def __init__(self, _factory=None, *, policy=compat32, strictheaders=True):
140140
"""_factory is called with no arguments to create a new message obj
141141
142142
The policy keyword specifies a policy object that controls a number of
@@ -165,6 +165,7 @@ def __init__(self, _factory=None, *, policy=compat32):
165165
self._cur = None
166166
self._last = None
167167
self._headersonly = False
168+
self._strictheaders = strictheaders
168169

169170
# Non-public interface for supporting Parser's headersonly flag
170171
def _set_headersonly(self):
@@ -225,14 +226,15 @@ def _parsegen(self):
225226
if line is NeedMoreData:
226227
yield NeedMoreData
227228
continue
228-
if not headerRE.match(line):
229+
if NLCRE.match(line):
230+
break
231+
elif not headerRE.match(line) and self._strictheaders:
229232
# The blank-line header/body separator was already consumed by the
230233
# branch above, so a line that is not a valid header here is treated
231234
# as part of the body: record the defect and push the line back.
232-
if not NLCRE.match(line):
233-
defect = errors.MissingHeaderBodySeparatorDefect()
234-
self.policy.handle_defect(self._cur, defect)
235-
self._input.unreadline(line)
235+
defect = errors.MissingHeaderBodySeparatorDefect()
236+
self.policy.handle_defect(self._cur, defect)
237+
self._input.unreadline(line)
236238
break
237239
headers.append(line)
238240
# Done with the headers, so parse them and figure out what we're

Lib/email/parser.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -38,15 +38,16 @@ def __init__(self, _class=None, *, policy=compat32):
3838
self._class = _class
3939
self.policy = policy
4040

41-
def parse(self, fp, headersonly=False):
41+
def parse(self, fp, headersonly=False, strictheaders=True):
4242
"""Create a message structure from the data in a file.
4343
4444
Reads all the data from the file and returns the root of the message
4545
structure. Optional headersonly is a flag specifying whether to stop
4646
parsing after reading the headers or not. The default is False,
4747
meaning it parses the entire contents of the file.
4848
"""
49-
feedparser = FeedParser(self._class, policy=self.policy)
49+
feedparser = FeedParser(
50+
self._class, policy=self.policy, strictheaders=strictheaders)
5051
if headersonly:
5152
feedparser._set_headersonly()
5253
while True:
@@ -56,24 +57,27 @@ def parse(self, fp, headersonly=False):
5657
feedparser.feed(data)
5758
return feedparser.close()
5859

59-
def parsestr(self, text, headersonly=False):
60+
def parsestr(self, text, headersonly=False, strictheaders=True):
6061
"""Create a message structure from a string.
6162
6263
Returns the root of the message structure. Optional headersonly is a
6364
flag specifying whether to stop parsing after reading the headers or
6465
not. The default is False, meaning it parses the entire contents of
6566
the file.
6667
"""
67-
return self.parse(StringIO(text), headersonly=headersonly)
68+
return self.parse(
69+
StringIO(text),
70+
headersonly=headersonly,
71+
strictheaders=strictheaders)
6872

6973

7074

7175
class HeaderParser(Parser):
72-
def parse(self, fp, headersonly=True):
73-
return Parser.parse(self, fp, True)
76+
def parse(self, fp, headersonly=True, strictheaders=True):
77+
return Parser.parse(self, fp, True, strictheaders=strictheaders)
7478

75-
def parsestr(self, text, headersonly=True):
76-
return Parser.parsestr(self, text, True)
79+
def parsestr(self, text, headersonly=True, strictheaders=True):
80+
return Parser.parsestr(self, text, True, strictheaders=strictheaders)
7781

7882

7983
class BytesParser:
@@ -96,7 +100,7 @@ def __init__(self, *args, **kw):
96100
"""
97101
self.parser = Parser(*args, **kw)
98102

99-
def parse(self, fp, headersonly=False):
103+
def parse(self, fp, headersonly=False, strictheaders=True):
100104
"""Create a message structure from the data in a binary file.
101105
102106
Reads all the data from the file and returns the root of the message
@@ -106,12 +110,12 @@ def parse(self, fp, headersonly=False):
106110
"""
107111
fp = TextIOWrapper(fp, encoding='ascii', errors='surrogateescape')
108112
try:
109-
return self.parser.parse(fp, headersonly)
113+
return self.parser.parse(fp, headersonly, strictheaders)
110114
finally:
111115
fp.detach()
112116

113117

114-
def parsebytes(self, text, headersonly=False):
118+
def parsebytes(self, text, headersonly=False, strictheaders=True):
115119
"""Create a message structure from a byte string.
116120
117121
Returns the root of the message structure. Optional headersonly is a
@@ -120,12 +124,14 @@ def parsebytes(self, text, headersonly=False):
120124
the file.
121125
"""
122126
text = text.decode('ASCII', errors='surrogateescape')
123-
return self.parser.parsestr(text, headersonly)
127+
return self.parser.parsestr(text, headersonly, strictheaders)
124128

125129

126130
class BytesHeaderParser(BytesParser):
127-
def parse(self, fp, headersonly=True):
128-
return BytesParser.parse(self, fp, headersonly=True)
131+
def parse(self, fp, headersonly=True, strictheaders=True):
132+
return BytesParser.parse(
133+
self, fp, headersonly=True, strictheaders=strictheaders)
129134

130-
def parsebytes(self, text, headersonly=True):
131-
return BytesParser.parsebytes(self, text, headersonly=True)
135+
def parsebytes(self, text, headersonly=True, strictheaders=True):
136+
return BytesParser.parsebytes(
137+
self, text, headersonly=True, strictheaders=strictheaders)

Lib/http/client.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,8 @@ def parse_headers(fp, _class=HTTPMessage):
233233
"""
234234
headers = _read_headers(fp)
235235
hstring = b''.join(headers).decode('iso-8859-1')
236-
return email.parser.Parser(_class=_class).parsestr(hstring)
236+
return email.parser.Parser(_class=_class).parsestr(
237+
hstring, headersonly=True, strictheaders=False)
237238

238239

239240
class HTTPResponse(io.BufferedIOBase):

Lib/test/test_httplib.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,48 @@ def test_parse_all_octets(self):
323323
self.assertIn(' folded with space', folded)
324324
self.assertTrue(folded.endswith('folded with tab'))
325325

326+
def test_parse_invalid_octets(self):
327+
# Ensure an out-of-spec (non-ASCII) header name neither breaks the parser nor hides the headers that follow it
328+
body = (
329+
b'HTTP/1.1 200 OK\r\n'
330+
b'A-Canonical: header\r\n'
331+
b'sOMe-CRazY: heADers\r\n'
332+
b'With-UTF-8-Values: \xe2\x9c\x94\r\n'
333+
b'And-Even-\xf0\x9f\x8c\xb4: names\r\n'
334+
b'and-more: after that\r\n'
335+
b'Transfer-Encoding: chunked\r\n'
336+
b'\r\n' # End of headers
337+
b'e\r\n'
338+
b'Hello, world!\n\r\n'
339+
b'0\r\n'
340+
b'\r\n'
341+
)
342+
sock = FakeSocket(body)
343+
resp = client.HTTPResponse(sock)
344+
resp.begin()
345+
# Even if there was a charset specified in the Content-Type, that
346+
# would only apply to the actual body. Interpret the out-of-spec
347+
# response headers as ISO-8859-1, which is consistent with the
348+
# encoding used for header values and how one would generate such
349+
# a response from a WSGI server.
350+
expected = {
351+
'A-Canonical': 'header',
352+
'sOMe-CRazY': 'heADers',
353+
'With-UTF-8-Values': '\xe2\x9c\x94',
354+
'And-Even-\xf0\x9f\x8c\xb4': 'names',
355+
'and-more': 'after that',
356+
'Transfer-Encoding': 'chunked',
357+
}
358+
case_sensitive = dict(resp.getheaders())
359+
for header, value in expected.items():
360+
with self.subTest((header, value)):
361+
self.assertEqual(resp.getheader(header), value)
362+
self.assertEqual(resp.msg[header], value)
363+
self.assertIn(header, case_sensitive)
364+
self.assertNotIn(header.upper(), case_sensitive)
365+
self.assertEqual(resp.read(), b'Hello, world!\n')
366+
self.assertEqual(resp.headers.get_payload(), '')
367+
326368
def test_invalid_headers(self):
327369
conn = client.HTTPConnection('example.com')
328370
conn.sock = FakeSocket('')

Misc/ACKS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,7 @@ Artem Bulgakov
257257
Dick Bulterman
258258
Bill Bumgarner
259259
Jimmy Burgett
260+
Tim Burke
260261
Charles Burkland
261262
Edmond Burnett
262263
Tommy Burnette
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
:mod:`http.client` now parses non-ASCII header names.

0 commit comments

Comments
 (0)