From d12fa4dbeb769e953d40cd8f7c94a757c58cae39 Mon Sep 17 00:00:00 2001
From: rka97 <ahmed.khaled@princeton.edu>
Date: Wed, 10 Nov 2021 15:23:00 +0200
Subject: [PATCH] Fix bug with parsing arXiv entries with multiline fields

When an entry was an unmatched article from arXiv, the normalizer re-parsed the
BibTeX entry by hand, line by line, looking for the title, author, and arXiv ID.
This manual parser fails when any field (title, author, etc.) spans multiple
lines. The external BibTeX parsing library already used elsewhere handles
multiline fields correctly, so this commit reuses its parsed result instead of
re-parsing the entry manually.
---
 rebiber/normalize.py | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/rebiber/normalize.py b/rebiber/normalize.py
index a2f94f1..f06bc2c 100644
--- a/rebiber/normalize.py
+++ b/rebiber/normalize.py
@@ -110,33 +110,36 @@ def normalize_bib(bib_db, all_bib_entries, output_bib_path, deduplicate=True, re
                 print(log_str)
                 output_bib_entries.append(found_bibitem)
         else:
-            bib_dict = {"arxiv_id": set()}
-            for line in bib_entry:
-                line = line.strip()
-                if line.startswith("@"):
-                    bib_dict["bibkey"] = line[line.find("{")+1:line.find(",")]
-                else:
-                    bib_dict[line[:line.find("=")].strip()] = line[line.find("{")+1:line.rfind("}")].strip()
-                    
-                for match in re.finditer(r"(arxiv:|abs/|pdf/)(([0-9]*).([0-9]*))", line.lower()):
-                    bib_dict["arxiv_id"].add(match.group(2))
-            
+            bib_dict = bib_entry_parsed.entries[0]
+            bib_dict["arxiv_id"] = set()
+            for match in re.finditer(
+                r"(arxiv:|abs/|pdf/)(([0-9]*).([0-9]*))", bib_entry_str.lower()
+            ):
+                bib_dict["arxiv_id"].add(match.group(2))
             if len(bib_dict["arxiv_id"]) == 1:
                 bib_dict["arxiv_id"] = bib_dict["arxiv_id"].pop()
                 bib_dict["arxiv_year"] = "20" + bib_dict["arxiv_id"].split(".")[0][:2]
-                
-                bib_entry = [line + "\n" for line in f"""@article{{{bib_dict['bibkey']},
+                bib_entry = [
+                    line + "\n"
+                    for line in f"""@{bib_dict['ENTRYTYPE']}{{{bib_dict['ID']},
                   title={{{bib_dict['title']}}},
                   author={{{bib_dict['author']}}},
                   journal={{ArXiv preprint}},
                   volume={{abs/{bib_dict['arxiv_id']}}},
                   year={{{bib_dict['arxiv_year']}}},
                   url={{https://arxiv.org/abs/{bib_dict['arxiv_id']}}}
-                }}""".split("\n")]
-
-                log_str = "Converted. ID: %s ; Title: %s" % (original_bibkey, original_title)
+                }}""".split(
+                        "\n"
+                    )
+                ]
+
+                log_str = "Converted arXiv entry. ID: %s ; Title: %s" % (
+                    original_bibkey,
+                    original_title,
+                )
                 num_converted += 1
                 print(log_str)
+
                 
             output_bib_entries.append(bib_entry)
     print("Num of converted items:", num_converted)