Skip to content

Commit

Permalink
Fix erroneous encoding of % char in URL field of fetch.txt which …
Browse files Browse the repository at this point in the history
…could break any already properly encoded URLs. This was due to a misinterpretation of the spec which stated that `%` should _only_ be URL encoded for the `filename` field. Per spec _only_ whitespace should be encoded in the URL field.

Updated unit tests to reflect these changes.
Bump version.
  • Loading branch information
mikedarcy committed May 7, 2024
1 parent 4e8f872 commit 36bfad1
Show file tree
Hide file tree
Showing 5 changed files with 10 additions and 10 deletions.
8 changes: 4 additions & 4 deletions bdbag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

logger = logging.getLogger(__name__)

__version__ = "1.7.2"
__version__ = "1.7.3"
__bagit_version__ = "1.8.1"
__bagit_profile_version__ = "1.3.1"

Expand Down Expand Up @@ -146,11 +146,11 @@ def parse_content_disposition(value): # pragma: no cover

# Per the bagit spec, we just want to replace (%,\r,\n,\t, ) for storage in fetch.txt, but this is also applicable
# to URI/IRI storage in ro-metadata as well
def escape_uri(uri, encode_whitespace=True):
def escape_uri(uri, encode_whitespace=True, encode_other=False):
if not uri:
return uri

uri = uri.replace("%", "%25").replace("\n", "%0A").replace("\r", "%0D")
if encode_other:
uri = uri.replace("%", "%25").replace("\n", "%0A").replace("\r", "%0D")
if encode_whitespace:
uri = uri.replace(" ", "%20").replace("\t", "%09")

Expand Down
2 changes: 1 addition & 1 deletion bdbag/bdbag_ro.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def add_file_metadata(manifest,
retrieved_from = None

if local_path:
uri = escape_uri(ensure_payload_path_prefix(local_path), encode_whitespace=False)
uri = escape_uri(ensure_payload_path_prefix(local_path), encode_whitespace=False, encode_other=True)
if source_url:
retrieved_from = dict(retrievedFrom=source_url)

Expand Down
2 changes: 1 addition & 1 deletion bdbag/bdbagit.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def _make_fetch_file(path, remote_entries):
fetch_file.write("%s\t%s\t%s\n" %
(escape_uri(remote_entries[filename]['url']),
remote_entries[filename]['length'],
escape_uri(_denormalize_filename(filename), encode_whitespace=False)))
escape_uri(_denormalize_filename(filename), encode_whitespace=False, encode_other=True)))


def _find_tag_files(bag_dir):
Expand Down
6 changes: 3 additions & 3 deletions test/test-data/test-config/test-fetch-manifest-encoding.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[
{
"url":"https://raw.githubusercontent.com/fair-research/bdbag/master/test/test-data/test-http-encoded/test fetch%http®.txt",
"url":"https://raw.githubusercontent.com/fair-research/bdbag/master/test/test-data/test-http-encoded/test fetch%25http®.txt",
"length":201,
"filename":"test fetch http®.txt",
"md5":"f3ad851f4213d41ce9690542010bffa0",
Expand All @@ -9,7 +9,7 @@
"sha512":"1bdbb4fc33dbea55ee0e5473062cbd003221c571c6110e2b3d2ebe5288a95a74ccd60fbbbcbd42fcfa4cd320b980ca27d537e6d29e60f73822e301a7ba6c1bfa"
},
{
"url":"https://raw.githubusercontent.com/fair-research/bdbag/master/test/test-data/test-http-encoded/test fetch%http®.txt",
"url":"https://raw.githubusercontent.com/fair-research/bdbag/master/test/test-data/test-http-encoded/test\tfetch%25http®.txt",
"length":201,
"filename":"test\nfetch\nhttp®.txt",
"md5":"f3ad851f4213d41ce9690542010bffa0",
Expand All @@ -18,7 +18,7 @@
"sha512":"1bdbb4fc33dbea55ee0e5473062cbd003221c571c6110e2b3d2ebe5288a95a74ccd60fbbbcbd42fcfa4cd320b980ca27d537e6d29e60f73822e301a7ba6c1bfa"
},
{
"url":"https://raw.githubusercontent.com/fair-research/bdbag/master/test/test-data/test-http-encoded/test fetch%http®.txt",
"url":"https://raw.githubusercontent.com/fair-research/bdbag/master/test/test-data/test-http-encoded/test%20fetch%25http®.txt",
"length":201,
"filename":"test%fetch http®\r\n.txt",
"md5":"f3ad851f4213d41ce9690542010bffa0",
Expand Down
2 changes: 1 addition & 1 deletion test/test_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def test_bag_with_unencoded_utf8_remote_file_manifest(self):
fetch_txt = ff.read()
self.assertIn(
u'https://raw.githubusercontent.com/fair-research/bdbag/master/test/test-data/test-http-encoded/'
u'test%20fetch%25http®.txt\t201\tdata/test%0Afetch%0Ahttp®.txt', fetch_txt)
u'test%09fetch%25http®.txt\t201\tdata/test%0Afetch%0Ahttp®.txt', fetch_txt)
self.assertIn(
u'https://raw.githubusercontent.com/fair-research/bdbag/master/test/test-data/test-http-encoded/'
u'test%20fetch%25http®.txt\t201\tdata/test fetch http®.txt', fetch_txt)
Expand Down

0 comments on commit 36bfad1

Please sign in to comment.