From 872000606271c52d989e53fe4cc9904343d81855 Mon Sep 17 00:00:00 2001 From: "Miss Islington (bot)" <31488909+miss-islington@users.noreply.github.com> Date: Tue, 14 May 2024 11:47:05 +0200 Subject: [PATCH] [3.13] gh-67693: Fix urlunparse() and urlunsplit() for URIs with path starting with multiple slashes and no authority (GH-113563) (GH-119023) (cherry picked from commit e237b25a4fa5626fcd1b1848aa03f725f892e40e) Co-authored-by: Serhiy Storchaka --- Lib/test/test_urlparse.py | 70 ++++++++++++++++++- Lib/urllib/parse.py | 2 +- ...9-08-27-01-16-50.gh-issue-67693.4NIAiy.rst | 2 + 3 files changed, 70 insertions(+), 4 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2019-08-27-01-16-50.gh-issue-67693.4NIAiy.rst diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 236b6e4516490a..2cf03d046a5b87 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -103,7 +103,9 @@ class UrlParseTestCase(unittest.TestCase): - def checkRoundtrips(self, url, parsed, split): + def checkRoundtrips(self, url, parsed, split, url2=None): + if url2 is None: + url2 = url result = urllib.parse.urlparse(url) self.assertSequenceEqual(result, parsed) t = (result.scheme, result.netloc, result.path, @@ -111,7 +113,7 @@ def checkRoundtrips(self, url, parsed, split): self.assertSequenceEqual(t, parsed) # put it back together and it should be the same result2 = urllib.parse.urlunparse(result) - self.assertSequenceEqual(result2, url) + self.assertSequenceEqual(result2, url2) self.assertSequenceEqual(result2, result.geturl()) # the result of geturl() is a fixpoint; we can always parse it @@ -137,7 +139,7 @@ def checkRoundtrips(self, url, parsed, split): result.query, result.fragment) self.assertSequenceEqual(t, split) result2 = urllib.parse.urlunsplit(result) - self.assertSequenceEqual(result2, url) + self.assertSequenceEqual(result2, url2) self.assertSequenceEqual(result2, result.geturl()) # check the fixpoint property of re-parsing the result of geturl() @@ -175,9 +177,39 @@ def test_qs(self): def test_roundtrips(self): str_cases = [ + ('path/to/file', + ('', '', 'path/to/file', '', '', ''), + ('', '', 'path/to/file', '', '')), + ('/path/to/file', + ('', '', '/path/to/file', '', '', ''), + ('', '', '/path/to/file', '', '')), + ('//path/to/file', + ('', 'path', '/to/file', '', '', ''), + ('', 'path', '/to/file', '', '')), + ('////path/to/file', + ('', '', '//path/to/file', '', '', ''), + ('', '', '//path/to/file', '', '')), + ('scheme:path/to/file', + ('scheme', '', 'path/to/file', '', '', ''), + ('scheme', '', 'path/to/file', '', '')), + ('scheme:/path/to/file', + ('scheme', '', '/path/to/file', '', '', ''), + ('scheme', '', '/path/to/file', '', '')), + ('scheme://path/to/file', + ('scheme', 'path', '/to/file', '', '', ''), + ('scheme', 'path', '/to/file', '', '')), + ('scheme:////path/to/file', + ('scheme', '', '//path/to/file', '', '', ''), + ('scheme', '', '//path/to/file', '', '')), ('file:///tmp/junk.txt', ('file', '', '/tmp/junk.txt', '', '', ''), ('file', '', '/tmp/junk.txt', '', '')), + ('file:////tmp/junk.txt', + ('file', '', '//tmp/junk.txt', '', '', ''), + ('file', '', '//tmp/junk.txt', '', '')), + ('file://///tmp/junk.txt', + ('file', '', '///tmp/junk.txt', '', '', ''), + ('file', '', '///tmp/junk.txt', '', '')), ('imap://mail.python.org/mbox1', ('imap', 'mail.python.org', '/mbox1', '', '', ''), ('imap', 'mail.python.org', '/mbox1', '', '')), @@ -213,6 +245,38 @@ def _encode(t): for url, parsed, split in str_cases + bytes_cases: self.checkRoundtrips(url, parsed, split) + def test_roundtrips_normalization(self): + str_cases = [ + ('///path/to/file', + '/path/to/file', + ('', '', '/path/to/file', '', '', ''), + ('', '', '/path/to/file', '', '')), + ('scheme:///path/to/file', + 'scheme:/path/to/file', + ('scheme', '', '/path/to/file', '', '', ''), + ('scheme', '', '/path/to/file', '', '')), + ('file:/tmp/junk.txt', + 'file:///tmp/junk.txt', + ('file', '', '/tmp/junk.txt', '', '', ''), + ('file', '', '/tmp/junk.txt', '', '')), + ('http:/tmp/junk.txt', + 'http:///tmp/junk.txt', + ('http', '', '/tmp/junk.txt', '', '', ''), + ('http', '', '/tmp/junk.txt', '', '')), + ('https:/tmp/junk.txt', + 'https:///tmp/junk.txt', + ('https', '', '/tmp/junk.txt', '', '', ''), + ('https', '', '/tmp/junk.txt', '', '')), + ] + def _encode(t): + return (t[0].encode('ascii'), + t[1].encode('ascii'), + tuple(x.encode('ascii') for x in t[2]), + tuple(x.encode('ascii') for x in t[3])) + bytes_cases = [_encode(x) for x in str_cases] + for url, url2, parsed, split in str_cases + bytes_cases: + self.checkRoundtrips(url, parsed, split, url2) + def test_http_roundtrips(self): # urllib.parse.urlsplit treats 'http:' as an optimized special case, # so we test both 'http:' and 'https:' in all the following. diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index fc9e7c99f283be..3932bb99c7e7d1 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -525,7 +525,7 @@ def urlunsplit(components): empty query; the RFC states that these are equivalent).""" scheme, netloc, url, query, fragment, _coerce_result = ( _coerce_args(*components)) - if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'): + if netloc or (scheme and scheme in uses_netloc) or url[:2] == '//': if url and url[:1] != '/': url = '/' + url url = '//' + (netloc or '') + url if scheme: diff --git a/Misc/NEWS.d/next/Library/2019-08-27-01-16-50.gh-issue-67693.4NIAiy.rst b/Misc/NEWS.d/next/Library/2019-08-27-01-16-50.gh-issue-67693.4NIAiy.rst new file mode 100644 index 00000000000000..22457df03e65c9 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-08-27-01-16-50.gh-issue-67693.4NIAiy.rst @@ -0,0 +1,2 @@ +Fix :func:`urllib.parse.urlunparse` and :func:`urllib.parse.urlunsplit` for URIs with path starting with multiple slashes and no authority. +Based on patch by Ashwin Ramaswami.