bpo-37093: Allow http.client to parse non-ASCII header names

tipabu · tipabu · commit 568f6db52afb · 2022-11-03T12:16:00.000-07:00
Previously, when http.client tried to parse a response from an
out-of-spec server that sent a header with a non-ASCII name,
email.feedparser would assume that the non-compliant header must be
part of a message body and abort parsing. However, http.client already
determined the boundary between headers and body and only passed the
headers to the parser. As a result, any headers after the first
non-compliant one would be silently (!) ignored. This could include
headers important for message framing like Content-Length and
Transfer-Encoding.

Long ago, this parsing was handled by the rfc822 module, which did not
care which bytes appeared in a header line as long as the line
contained a colon.

Now, add an optional ``strictheaders`` argument to the email parsers that
controls whether header names must be strictly RFC-compliant. It
defaults to True to minimize the possibility of breaking other callers.
http.client, which already knows where the headers end and the body
begins, passes False.

Note that the non-ASCII names will be decoded as ISO-8859-1 in keeping
with how header values are decoded.
diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py
@@ -136,7 +136,7 @@ def __next__(self):
 class FeedParser:
     """A feed-style parser of email."""
 
-    def __init__(self, _factory=None, *, policy=compat32):
+    def __init__(self, _factory=None, *, policy=compat32, strictheaders=True):
         """_factory is called with no arguments to create a new message obj
 
         The policy keyword specifies a policy object that controls a number of
@@ -165,6 +165,7 @@ def __init__(self, _factory=None, *, policy=compat32):
         self._cur = None
         self._last = None
         self._headersonly = False
+        self._strictheaders = strictheaders
 
     # Non-public interface for supporting Parser's headersonly flag
     def _set_headersonly(self):
@@ -225,14 +226,15 @@ def _parsegen(self):
             if line is NeedMoreData:
                 yield NeedMoreData
                 continue
-            if not headerRE.match(line):
+            if NLCRE.match(line):
+                break
+            elif not headerRE.match(line) and self._strictheaders:
                 # If we saw the RFC defined header/body separator
                 # (i.e. newline), just throw it away. Otherwise the line is
                 # part of the body so push it back.
-                if not NLCRE.match(line):
-                    defect = errors.MissingHeaderBodySeparatorDefect()
-                    self.policy.handle_defect(self._cur, defect)
-                    self._input.unreadline(line)
+                defect = errors.MissingHeaderBodySeparatorDefect()
+                self.policy.handle_defect(self._cur, defect)
+                self._input.unreadline(line)
                 break
             headers.append(line)
         # Done with the headers, so parse them and figure out what we're
diff --git a/Lib/email/parser.py b/Lib/email/parser.py
@@ -38,15 +38,16 @@ def __init__(self, _class=None, *, policy=compat32):
         self._class = _class
         self.policy = policy
 
-    def parse(self, fp, headersonly=False):
+    def parse(self, fp, headersonly=False, strictheaders=True):
         """Create a message structure from the data in a file.
 
         Reads all the data from the file and returns the root of the message
         structure.  Optional headersonly is a flag specifying whether to stop
         parsing after reading the headers or not.  The default is False,
         meaning it parses the entire contents of the file.
         """
-        feedparser = FeedParser(self._class, policy=self.policy)
+        feedparser = FeedParser(
+            self._class, policy=self.policy, strictheaders=strictheaders)
         if headersonly:
             feedparser._set_headersonly()
         while True:
@@ -56,24 +57,27 @@ def parse(self, fp, headersonly=False):
             feedparser.feed(data)
         return feedparser.close()
 
-    def parsestr(self, text, headersonly=False):
+    def parsestr(self, text, headersonly=False, strictheaders=True):
         """Create a message structure from a string.
 
         Returns the root of the message structure.  Optional headersonly is a
         flag specifying whether to stop parsing after reading the headers or
         not.  The default is False, meaning it parses the entire contents of
         the file.
         """
-        return self.parse(StringIO(text), headersonly=headersonly)
+        return self.parse(
+            StringIO(text),
+            headersonly=headersonly,
+            strictheaders=strictheaders)
 
 
 
 class HeaderParser(Parser):
-    def parse(self, fp, headersonly=True):
-        return Parser.parse(self, fp, True)
+    def parse(self, fp, headersonly=True, strictheaders=True):
+        return Parser.parse(self, fp, True, strictheaders=strictheaders)
 
-    def parsestr(self, text, headersonly=True):
-        return Parser.parsestr(self, text, True)
+    def parsestr(self, text, headersonly=True, strictheaders=True):
+        return Parser.parsestr(self, text, True, strictheaders=strictheaders)
 
 
 class BytesParser:
@@ -96,7 +100,7 @@ def __init__(self, *args, **kw):
         """
         self.parser = Parser(*args, **kw)
 
-    def parse(self, fp, headersonly=False):
+    def parse(self, fp, headersonly=False, strictheaders=True):
         """Create a message structure from the data in a binary file.
 
         Reads all the data from the file and returns the root of the message
@@ -106,12 +110,12 @@ def parse(self, fp, headersonly=False):
         """
         fp = TextIOWrapper(fp, encoding='ascii', errors='surrogateescape')
         try:
-            return self.parser.parse(fp, headersonly)
+            return self.parser.parse(fp, headersonly, strictheaders)
         finally:
             fp.detach()
 
 
-    def parsebytes(self, text, headersonly=False):
+    def parsebytes(self, text, headersonly=False, strictheaders=True):
         """Create a message structure from a byte string.
 
         Returns the root of the message structure.  Optional headersonly is a
@@ -120,12 +124,14 @@ def parsebytes(self, text, headersonly=False):
         the file.
         """
         text = text.decode('ASCII', errors='surrogateescape')
-        return self.parser.parsestr(text, headersonly)
+        return self.parser.parsestr(text, headersonly, strictheaders)
 
 
 class BytesHeaderParser(BytesParser):
-    def parse(self, fp, headersonly=True):
-        return BytesParser.parse(self, fp, headersonly=True)
+    def parse(self, fp, headersonly=True, strictheaders=True):
+        return BytesParser.parse(
+            self, fp, headersonly=True, strictheaders=strictheaders)
 
-    def parsebytes(self, text, headersonly=True):
-        return BytesParser.parsebytes(self, text, headersonly=True)
+    def parsebytes(self, text, headersonly=True, strictheaders=True):
+        return BytesParser.parsebytes(
+            self, text, headersonly=True, strictheaders=strictheaders)
diff --git a/Lib/http/client.py b/Lib/http/client.py
@@ -233,7 +233,8 @@ def parse_headers(fp, _class=HTTPMessage):
     """
     headers = _read_headers(fp)
     hstring = b''.join(headers).decode('iso-8859-1')
-    return email.parser.Parser(_class=_class).parsestr(hstring)
+    return email.parser.Parser(_class=_class).parsestr(
+        hstring, headersonly=True, strictheaders=False)
 
 
 class HTTPResponse(io.BufferedIOBase):
diff --git a/Lib/test/test_httplib.py b/Lib/test/test_httplib.py
@@ -323,6 +323,48 @@ def test_parse_all_octets(self):
             self.assertIn(' folded with space', folded)
             self.assertTrue(folded.endswith('folded with tab'))
 
+    def test_parse_invalid_octets(self):
+        # Ensure a non-ASCII header name does not stop later headers parsing
+        body = (
+            b'HTTP/1.1 200 OK\r\n'
+            b'A-Canonical: header\r\n'
+            b'sOMe-CRazY: heADers\r\n'
+            b'With-UTF-8-Values: \xe2\x9c\x94\r\n'
+            b'And-Even-\xf0\x9f\x8c\xb4: names\r\n'
+            b'and-more: after that\r\n'
+            b'Transfer-Encoding: chunked\r\n'
+            b'\r\n'  # End of headers
+            b'e\r\n'
+            b'Hello, world!\n\r\n'
+            b'0\r\n'
+            b'\r\n'
+        )
+        sock = FakeSocket(body)
+        resp = client.HTTPResponse(sock)
+        resp.begin()
+        # Even if there was a charset specified in the Content-Type, that
+        # would only apply to the actual body. Interpret the out-of-spec
+        # response headers as ISO-8859-1, which is consistent with the
+        # encoding used for header values and how one would generate such
+        # a response from a WSGI server.
+        expected = {
+            'A-Canonical': 'header',
+            'sOMe-CRazY': 'heADers',
+            'With-UTF-8-Values': '\xe2\x9c\x94',
+            'And-Even-\xf0\x9f\x8c\xb4': 'names',
+            'and-more': 'after that',
+            'Transfer-Encoding': 'chunked',
+        }
+        case_sensitive = dict(resp.getheaders())
+        for header, value in expected.items():
+            with self.subTest((header, value)):
+                self.assertEqual(resp.getheader(header), value)
+                self.assertEqual(resp.msg[header], value)
+                self.assertIn(header, case_sensitive)
+                self.assertNotIn(header.upper(), case_sensitive)
+        self.assertEqual(resp.read(), b'Hello, world!\n')
+        self.assertEqual(resp.headers.get_payload(), '')
+
     def test_invalid_headers(self):
         conn = client.HTTPConnection('example.com')
         conn.sock = FakeSocket('')
diff --git a/Misc/ACKS b/Misc/ACKS
@@ -257,6 +257,7 @@ Artem Bulgakov
 Dick Bulterman
 Bill Bumgarner
 Jimmy Burgett
+Tim Burke
 Charles Burkland
 Edmond Burnett
 Tommy Burnette
diff --git a/Misc/NEWS.d/next/Library/2019-06-17-08-42-34.bpo-37093.T2sOF8.rst b/Misc/NEWS.d/next/Library/2019-06-17-08-42-34.bpo-37093.T2sOF8.rst
@@ -0,0 +1 @@
+:mod:`http.client` now parses non-ASCII header names.

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+:mod:`http.client` now parses non-ASCII header names.