From d12fa4dbeb769e953d40cd8f7c94a757c58cae39 Mon Sep 17 00:00:00 2001 From: rka97 Date: Wed, 10 Nov 2021 15:23:00 +0200 Subject: [PATCH] Fix bug with parsing arXiv entries with multiline fields In case the entry was an unmatched article from arXiv, the normalizer re-parsed the bibtex entry manually looking for the title, author, and arXiv id information. This manual parser fails when any of the entries (title/author/etc.) spans multiple lines. Moreover, the external bibtex parsing library used elsewhere already works quite well. This commit just uses the external parser which can handle multiline fields just fine. --- rebiber/normalize.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/rebiber/normalize.py b/rebiber/normalize.py index a2f94f1..f06bc2c 100644 --- a/rebiber/normalize.py +++ b/rebiber/normalize.py @@ -110,33 +110,36 @@ def normalize_bib(bib_db, all_bib_entries, output_bib_path, deduplicate=True, re print(log_str) output_bib_entries.append(found_bibitem) else: - bib_dict = {"arxiv_id": set()} - for line in bib_entry: - line = line.strip() - if line.startswith("@"): - bib_dict["bibkey"] = line[line.find("{")+1:line.find(",")] - else: - bib_dict[line[:line.find("=")].strip()] = line[line.find("{")+1:line.rfind("}")].strip() - - for match in re.finditer(r"(arxiv:|abs/|pdf/)(([0-9]*).([0-9]*))", line.lower()): - bib_dict["arxiv_id"].add(match.group(2)) - + bib_dict = bib_entry_parsed.entries[0] + bib_dict["arxiv_id"] = set() + for match in re.finditer( + r"(arxiv:|abs/|pdf/)(([0-9]*).([0-9]*))", bib_entry_str.lower() + ): + bib_dict["arxiv_id"].add(match.group(2)) if len(bib_dict["arxiv_id"]) == 1: bib_dict["arxiv_id"] = bib_dict["arxiv_id"].pop() bib_dict["arxiv_year"] = "20" + bib_dict["arxiv_id"].split(".")[0][:2] - - bib_entry = [line + "\n" for line in f"""@article{{{bib_dict['bibkey']}, + bib_entry = [ + line + "\n" + for line in f"""@{bib_dict['ENTRYTYPE']}{{{bib_dict['ID']}, title={{{bib_dict['title']}}}, author={{{bib_dict['author']}}}, journal={{ArXiv preprint}}, volume={{abs/{bib_dict['arxiv_id']}}}, year={{{bib_dict['arxiv_year']}}}, url={{https://arxiv.org/abs/{bib_dict['arxiv_id']}}} - }}""".split("\n")] - - log_str = "Converted. ID: %s ; Title: %s" % (original_bibkey, original_title) + }}""".split( + "\n" + ) + ] + + log_str = "Converted arXiv entry. ID: %s ; Title: %s" % ( + original_bibkey, + original_title, + ) num_converted += 1 print(log_str) + output_bib_entries.append(bib_entry) print("Num of converted items:", num_converted)