Add functionality to replace text

forward port of 8928445 by https://github.com/ArifRasim
py-pdf · Aug 9, 2024 · 289c94e · 289c94e
1 parent d3412d2
commit 289c94e
Show file tree

Hide file tree

Showing 8 changed files with 161 additions and 11 deletions.
diff --git a/camelot/cli.py b/camelot/cli.py
@@ -72,6 +72,12 @@ def set_config(self, key, value):
     help="Characters that should be stripped from a string before"
     " assigning it to a cell.",
 )
+@click.option(
+    "-replace",
+    "--replace_text",
+    help="Characters that should be replaced from a string before"
+    " assigning it to a cell.",
+)
 @click.option(
     "-M",
     "--margins",

diff --git a/camelot/io.py b/camelot/io.py
@@ -58,6 +58,9 @@ def read_pdf(
     strip_text : str, optional (default: '')
         Characters that should be stripped from a string before
         assigning it to a cell.
+    replace_text : dict, optional (default: {})
+        Mapping of substrings to their replacements, applied to a
+        string before assigning it to a cell.
     row_tol^ : int, optional (default: 2)
         Tolerance parameter used to combine text vertically,
         to generate rows.

diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py
@@ -63,6 +63,9 @@ class Lattice(BaseParser):
     strip_text : str, optional (default: '')
         Characters that should be stripped from a string before
         assigning it to a cell.
+    replace_text : dict, optional (default: {})
+        Mapping of substrings to their replacements, applied to a
+        string before assigning it to a cell.
     line_tol : int, optional (default: 2)
         Tolerance parameter used to merge close vertical and horizontal
         lines.
@@ -99,6 +102,7 @@ def __init__(
         split_text=False,
         flag_size=False,
         strip_text="",
+        replace_text={},
         line_tol=2,
         joint_tol=2,
         threshold_blocksize=15,
@@ -117,6 +121,7 @@ def __init__(
         self.split_text = split_text
         self.flag_size = flag_size
         self.strip_text = strip_text
+        self.replace_text = replace_text
         self.line_tol = line_tol
         self.joint_tol = joint_tol
         self.threshold_blocksize = threshold_blocksize
@@ -360,6 +365,7 @@ def _generate_table(self, table_idx, cols, rows, **kwargs):
                     split_text=self.split_text,
                     flag_size=self.flag_size,
                     strip_text=self.strip_text,
+                    replace_text=self.replace_text,
                 )
                 if indices[0][:2] != (-1, -1):
                     pos_errors.append(error)

diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py
@@ -45,6 +45,9 @@ class Stream(BaseParser):
     strip_text : str, optional (default: '')
         Characters that should be stripped from a string before
         assigning it to a cell.
+    replace_text : dict, optional (default: {})
+        Mapping of substrings to their replacements, applied to a
+        string before assigning it to a cell.
     edge_tol : int, optional (default: 50)
         Tolerance parameter for extending textedges vertically.
     row_tol : int, optional (default: 2)
@@ -64,6 +67,7 @@ def __init__(
         split_text=False,
         flag_size=False,
         strip_text="",
+        replace_text={},
         edge_tol=50,
         row_tol=2,
         column_tol=0,
@@ -76,6 +80,7 @@ def __init__(
         self.split_text = split_text
         self.flag_size = flag_size
         self.strip_text = strip_text
+        self.replace_text = replace_text
         self.edge_tol = edge_tol
         self.row_tol = row_tol
         self.column_tol = column_tol
@@ -414,6 +419,7 @@ def _generate_table(self, table_idx, cols, rows, **kwargs):
                     split_text=self.split_text,
                     flag_size=self.flag_size,
                     strip_text=self.strip_text,
+                    replace_text=self.replace_text,
                 )
                 if indices[:2] != (-1, -1):
                     pos_errors.append(error)

diff --git a/camelot/utils.py b/camelot/utils.py
@@ -505,12 +505,33 @@ def text_strip(text, strip=""):
     return stripped
 
 
+def text_replace(text, replace=None):
+    """Replace each key of `replace` found in `text` with its value.
+
+    Parameters
+    ----------
+    text : str
+        Text to process and modify.
+    replace : dict, optional (default: None)
+        Substring -> replacement pairs, applied in dict iteration order.
+
+    Returns
+    -------
+    text : str
+    """
+    if not replace:  # None or empty mapping: nothing to substitute
+        return text
+    for key, value in replace.items():
+        text = text.replace(key, value)
+    return text
+
+
 # TODO: combine the following functions into a TextProcessor class which
 # applies corresponding transformations sequentially
 # (inspired from sklearn.pipeline.Pipeline)
 
 
-def flag_font_size(textline, direction, strip_text=""):
+def flag_font_size(textline, direction, strip_text="", replace_text={}):
     """Flags super/subscripts in text by enclosing them with <s></s>.
     May give false positives.
 
@@ -523,6 +544,9 @@ def flag_font_size(textline, direction, strip_text=""):
     strip_text : str, optional (default: '')
         Characters that should be stripped from a string before
         assigning it to a cell.
+    replace_text : dict, optional (default: {})
+        Mapping of substrings to their replacements, applied to a
+        string before assigning it to a cell.
 
     Returns
     -------
@@ -559,10 +583,13 @@ def flag_font_size(textline, direction, strip_text=""):
         fstring = "".join(flist)
     else:
         fstring = "".join([t.get_text() for t in textline])
+    fstring = text_replace(fstring, replace_text)
     return text_strip(fstring, strip_text)
 
 
-def split_textline(table, textline, direction, flag_size=False, strip_text=""):
+def split_textline(
+    table, textline, direction, flag_size=False, strip_text="", replace_text={}
+):
     """Splits PDFMiner LTTextLine into substrings if it spans across
     multiple rows/columns.
 
@@ -580,6 +607,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
     strip_text : str, optional (default: '')
         Characters that should be stripped from a string before
         assigning it to a cell.
+    replace_text : dict, optional (default: {})
+        Mapping of substrings to their replacements, applied to a
+        string before assigning it to a cell.
 
     Returns
     -------
@@ -668,20 +698,28 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
                     key[0],
                     key[1],
                     flag_font_size(
-                        [t[2] for t in chars], direction, strip_text=strip_text
+                        [t[2] for t in chars],
+                        direction,
+                        strip_text=strip_text,
+                        replace_text=replace_text,
                     ),
                 )
             )
         else:
-            gchars = [t[2].get_text() for t in chars]
-            grouped_chars.append(
-                (key[0], key[1], text_strip("".join(gchars), strip_text))
-            )
+            gchars = "".join([t[2].get_text() for t in chars])
+            gchars = text_replace(gchars, replace_text)
+            grouped_chars.append((key[0], key[1], text_strip(gchars, strip_text)))
     return grouped_chars
 
 
 def get_table_index(
-    table, t, direction, split_text=False, flag_size=False, strip_text=""
+    table,
+    t,
+    direction,
+    split_text=False,
+    flag_size=False,
+    strip_text="",
+    replace_text={},
 ):
     """Gets indices of the table cell where given text object lies by
     comparing their y and x-coordinates.
@@ -703,6 +741,9 @@ def get_table_index(
     strip_text : str, optional (default: '')
         Characters that should be stripped from a string before
         assigning it to a cell.
+    replace_text : dict, optional (default: {})
+        Mapping of substrings to their replacements, applied to a
+        string before assigning it to a cell.
 
     Returns
     -------
@@ -761,7 +802,12 @@ def get_table_index(
     if split_text:
         return (
             split_textline(
-                table, t, direction, flag_size=flag_size, strip_text=strip_text
+                table,
+                t,
+                direction,
+                flag_size=flag_size,
+                strip_text=strip_text,
+                replace_text=replace_text,
             ),
             error,
         )
@@ -772,13 +818,20 @@ def get_table_index(
                     (
                         r_idx,
                         c_idx,
-                        flag_font_size(t._objs, direction, strip_text=strip_text),
+                        flag_font_size(
+                            t._objs,
+                            direction,
+                            strip_text=strip_text,
+                            replace_text=replace_text,
+                        ),
                     )
                 ],
                 error,
             )
         else:
-            return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error
+            text = t.get_text()
+            text = text_replace(text, replace_text)
+            return [(r_idx, c_idx, text_strip(text, strip_text))], error
 
 
 def compute_accuracy(error_weights):

diff --git a/tests/data.py b/tests/data.py
@@ -2306,6 +2306,33 @@
     ["ChâteauLéoube2016", "10€"],
 ]
 
+data_stream_replace_text = [  # expected cells: "€" -> "$", "20" -> "33"
+    ["VinsauVerre", ""],
+    ["LesBlancs", "12.5CL"],
+    ["A.O.PCôtesduRhône", ""],
+    ["DomainedelaGuicharde«Autourdelachapelle»3316", "8$"],
+    ["A.O.PVacqueyras", ""],
+    ["DomainedeMontvac«Melodine»3316", "10$"],
+    ["A.O.PChâteauneufduPape", ""],
+    ["DomainedeBeaurenard3317", "13$"],
+    ["A.O.PCôteauxduLanguedoc", ""],
+    ["VillaTempora«Untempspourelle»3314", "9$"],
+    ["A.O.PCôtesdeProvence", ""],
+    ["ChâteauGrandBoise3317", "9$"],
+    ["LesRosés", "125CL"],
+    ["A.O.PCôtesduRhône", ""],
+    ["DomainedelaFlorane«AfleurdePampre»3316", "8$"],
+    ["FamilleCoulon(DomaineBeaurenard)Biotifulfox3317", "8$"],
+    ["A.O.PVacqueyras", ""],
+    ["DomainedeMontvac3317", "9$"],
+    ["A.O.PLanguedoc", ""],
+    ["DomainedeJoncas«Nébla»3315", "8$"],
+    ["VillaTempora«L’arroseurarrosé»3315", "9$"],
+    ["A.O.PCôtesdeProvence", ""],
+    ["ChâteauGrandBoise«SainteVictoire»3317", "9$"],
+    ["ChâteauLéoube3316", "10$"],
+]
+
 data_stream_edge_tol = [
     ["Key figures", ""],
     ["", "2016"],
@@ -2368,6 +2395,32 @@
     ["4171_1", "0.07", "173.9", "58.1%", "1.6%", "2.1%", "0.5%"],
 ]
 
+data_lattice_text_replace = [  # expected cells with every "." replaced by ","
+    [
+        "Cycle \nName",
+        "KI \n(1/km)",
+        "Distance \n(mi)",
+        "Percent Fuel Savings",
+        "",
+        "",
+        "",
+    ],
+    [
+        "",
+        "",
+        "",
+        "Improved \nSpeed",
+        "Decreased \nAccel",
+        "Eliminate \nStops",
+        "Decreased \nIdle",
+    ],
+    ["2012_2", "3,30", "1,3", "5,9%", "9,5%", "29,2%", "17,4%"],
+    ["2145_1", "0,68", "11,2", "2,4%", "0,1%", "9,5%", "2,7%"],
+    ["4234_1", "0,59", "58,7", "8,5%", "1,3%", "8,5%", "3,3%"],
+    ["2032_2", "0,17", "57,8", "21,7%", "0,3%", "2,7%", "1,2%"],
+    ["4171_1", "0,07", "173,9", "58,1%", "1,6%", "2,1%", "0,5%"],
+]
+
 data_lattice_table_rotated = [
     [
         "State",

diff --git a/tests/test_lattice.py b/tests/test_lattice.py
@@ -20,6 +20,17 @@ def test_lattice(testdir):
     assert_frame_equal(df, tables[0].df)
 
 
+@skip_on_windows
+def test_lattice_text_replace(testdir):
+    # Every "." in the extracted text is expected to come back as ",".
+    pdf_path = os.path.join(
+        testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf"
+    )
+    expected = pd.DataFrame(data_lattice_text_replace)
+    tables = camelot.read_pdf(pdf_path, pages="2", replace_text={".": ","})
+    assert_frame_equal(expected, tables[0].df)
+
+
 @skip_on_windows
 def test_lattice_table_rotated(testdir):
     df = pd.DataFrame(data_lattice_table_rotated)

diff --git a/tests/test_stream.py b/tests/test_stream.py
@@ -98,6 +98,18 @@ def test_stream_strip_text(testdir):
     tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n")
     assert_frame_equal(df, tables[0].df)
 
+def test_stream_replace_text(testdir):
+    """Stream parsing applies `replace_text` together with `strip_text`."""
+    options = {
+        "flavor": "stream",
+        "strip_text": " ,\n",
+        "replace_text": {"€": "$", "20": "33"},
+    }
+    pdf_path = os.path.join(testdir, "detect_vertical_false.pdf")
+    expected = pd.DataFrame(data_stream_replace_text)
+
+    tables = camelot.read_pdf(pdf_path, **options)
+    assert_frame_equal(expected, tables[0].df)
 
 def test_stream_edge_tol(testdir):
     df = pd.DataFrame(data_stream_edge_tol)