Skip to content

Commit

Permalink
Add functionality to replace text
Browse files Browse the repository at this point in the history
  • Loading branch information
bosd committed Aug 9, 2024
1 parent d3412d2 commit 289c94e
Show file tree
Hide file tree
Showing 8 changed files with 161 additions and 11 deletions.
6 changes: 6 additions & 0 deletions camelot/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@ def set_config(self, key, value):
help="Characters that should be stripped from a string before"
" assigning it to a cell.",
)
@click.option(
"-replace",
"--replace_text",
help="Characters that should be replaced from a string before"
" assigning it to a cell.",
)
@click.option(
"-M",
"--margins",
Expand Down
3 changes: 3 additions & 0 deletions camelot/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ def read_pdf(
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
replace_text : dict, optional (default: {})
Characters that should be replaced from a string before
assigning it to a cell.
row_tol^ : int, optional (default: 2)
Tolerance parameter used to combine text vertically,
to generate rows.
Expand Down
6 changes: 6 additions & 0 deletions camelot/parsers/lattice.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ class Lattice(BaseParser):
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
replace_text : dict, optional (default: {})
Characters that should be replaced from a string before
assigning it to a cell.
line_tol : int, optional (default: 2)
Tolerance parameter used to merge close vertical and horizontal
lines.
Expand Down Expand Up @@ -99,6 +102,7 @@ def __init__(
split_text=False,
flag_size=False,
strip_text="",
replace_text={},
line_tol=2,
joint_tol=2,
threshold_blocksize=15,
Expand All @@ -117,6 +121,7 @@ def __init__(
self.split_text = split_text
self.flag_size = flag_size
self.strip_text = strip_text
self.replace_text = replace_text
self.line_tol = line_tol
self.joint_tol = joint_tol
self.threshold_blocksize = threshold_blocksize
Expand Down Expand Up @@ -360,6 +365,7 @@ def _generate_table(self, table_idx, cols, rows, **kwargs):
split_text=self.split_text,
flag_size=self.flag_size,
strip_text=self.strip_text,
replace_text=self.replace_text,
)
if indices[0][:2] != (-1, -1):
pos_errors.append(error)
Expand Down
6 changes: 6 additions & 0 deletions camelot/parsers/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ class Stream(BaseParser):
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
replace_text : dict, optional (default: {})
Characters that should be replaced from a string before
assigning it to a cell.
edge_tol : int, optional (default: 50)
Tolerance parameter for extending textedges vertically.
row_tol : int, optional (default: 2)
Expand All @@ -64,6 +67,7 @@ def __init__(
split_text=False,
flag_size=False,
strip_text="",
replace_text={},
edge_tol=50,
row_tol=2,
column_tol=0,
Expand All @@ -76,6 +80,7 @@ def __init__(
self.split_text = split_text
self.flag_size = flag_size
self.strip_text = strip_text
self.replace_text = replace_text
self.edge_tol = edge_tol
self.row_tol = row_tol
self.column_tol = column_tol
Expand Down Expand Up @@ -414,6 +419,7 @@ def _generate_table(self, table_idx, cols, rows, **kwargs):
split_text=self.split_text,
flag_size=self.flag_size,
strip_text=self.strip_text,
replace_text=self.replace_text,
)
if indices[:2] != (-1, -1):
pos_errors.append(error)
Expand Down
75 changes: 64 additions & 11 deletions camelot/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,12 +505,33 @@ def text_strip(text, strip=""):
return stripped


def text_replace(text, replace=None):
    """Replace each key of `replace` found in `text` with its value.

    Parameters
    ----------
    text : str
        Text to process and modify.
    replace : dict, optional (default: None)
        Key value pairs, where every occurrence of a key in `text` is
        swapped for the corresponding value. Pairs are applied one after
        another in the dict's iteration order, so a later pair also acts
        on the output of an earlier one. ``None`` or an empty dict
        leaves `text` unchanged.

    Returns
    -------
    text : str
        The text with all replacements applied.

    """
    # Truthiness check instead of `replace is {}`: an identity comparison
    # against a freshly created dict literal can never be True, so the old
    # guard was dead code. This also covers `None`.
    if not replace:
        return text

    for key, value in replace.items():
        text = text.replace(key, value)

    return text


# TODO: combine the following functions into a TextProcessor class which
# applies corresponding transformations sequentially
# (inspired from sklearn.pipeline.Pipeline)


def flag_font_size(textline, direction, strip_text=""):
def flag_font_size(textline, direction, strip_text="", replace_text={}):
"""Flags super/subscripts in text by enclosing them with <s></s>.
May give false positives.
Expand All @@ -523,6 +544,9 @@ def flag_font_size(textline, direction, strip_text=""):
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
replace_text : dict, optional (default: {})
Characters that should be replaced from a string before
assigning it to a cell.
Returns
-------
Expand Down Expand Up @@ -559,10 +583,13 @@ def flag_font_size(textline, direction, strip_text=""):
fstring = "".join(flist)
else:
fstring = "".join([t.get_text() for t in textline])
fstring = text_replace(fstring, replace_text)
return text_strip(fstring, strip_text)


def split_textline(table, textline, direction, flag_size=False, strip_text=""):
def split_textline(
table, textline, direction, flag_size=False, strip_text="", replace_text={}
):
"""Splits PDFMiner LTTextLine into substrings if it spans across
multiple rows/columns.
Expand All @@ -580,6 +607,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
replace_text : dict, optional (default: {})
Characters that should be replaced from a string before
assigning it to a cell.
Returns
-------
Expand Down Expand Up @@ -668,20 +698,28 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
key[0],
key[1],
flag_font_size(
[t[2] for t in chars], direction, strip_text=strip_text
[t[2] for t in chars],
direction,
strip_text=strip_text,
replace_text=replace_text,
),
)
)
else:
gchars = [t[2].get_text() for t in chars]
grouped_chars.append(
(key[0], key[1], text_strip("".join(gchars), strip_text))
)
gchars = "".join([t[2].get_text() for t in chars])
gchars = text_replace(gchars, replace_text)
grouped_chars.append((key[0], key[1], text_strip(gchars, strip_text)))
return grouped_chars


def get_table_index(
table, t, direction, split_text=False, flag_size=False, strip_text=""
table,
t,
direction,
split_text=False,
flag_size=False,
strip_text="",
replace_text={},
):
"""Gets indices of the table cell where given text object lies by
comparing their y and x-coordinates.
Expand All @@ -703,6 +741,9 @@ def get_table_index(
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
replace_text : dict, optional (default: {})
Characters that should be replaced from a string before
assigning it to a cell.
Returns
-------
Expand Down Expand Up @@ -761,7 +802,12 @@ def get_table_index(
if split_text:
return (
split_textline(
table, t, direction, flag_size=flag_size, strip_text=strip_text
table,
t,
direction,
flag_size=flag_size,
strip_text=strip_text,
replace_text=replace_text,
),
error,
)
Expand All @@ -772,13 +818,20 @@ def get_table_index(
(
r_idx,
c_idx,
flag_font_size(t._objs, direction, strip_text=strip_text),
flag_font_size(
t._objs,
direction,
strip_text=strip_text,
replace_text=replace_text,
),
)
],
error,
)
else:
return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error
text = t.get_text()
text = text_replace(text, replace_text)
return [(r_idx, c_idx, text_strip(text, strip_text))], error


def compute_accuracy(error_weights):
Expand Down
53 changes: 53 additions & 0 deletions tests/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2306,6 +2306,33 @@
["ChâteauLéoube2016", "10€"],
]

# Expected table for the stream-flavor `replace_text` test: it is compared
# against the table parsed from detect_vertical_false.pdf with
# strip_text=" ,\n" and replace_text={"€": "$", "20": "33"}.
data_stream_replace_text = [
["VinsauVerre", ""],
["LesBlancs", "12.5CL"],
["A.O.PCôtesduRhône", ""],
["DomainedelaGuicharde«Autourdelachapelle»3316", "8$"],
["A.O.PVacqueyras", ""],
["DomainedeMontvac«Melodine»3316", "10$"],
["A.O.PChâteauneufduPape", ""],
["DomainedeBeaurenard3317", "13$"],
["A.O.PCôteauxduLanguedoc", ""],
["VillaTempora«Untempspourelle»3314", "9$"],
["A.O.PCôtesdeProvence", ""],
["ChâteauGrandBoise3317", "9$"],
["LesRosés", "125CL"],
["A.O.PCôtesduRhône", ""],
["DomainedelaFlorane«AfleurdePampre»3316", "8$"],
["FamilleCoulon(DomaineBeaurenard)Biotifulfox3317", "8$"],
["A.O.PVacqueyras", ""],
["DomainedeMontvac3317", "9$"],
["A.O.PLanguedoc", ""],
["DomainedeJoncas«Nébla»3315", "8$"],
["VillaTempora«L’arroseurarrosé»3315", "9$"],
["A.O.PCôtesdeProvence", ""],
["ChâteauGrandBoise«SainteVictoire»3317", "9$"],
["ChâteauLéoube3316", "10$"],
]

data_stream_edge_tol = [
["Key figures", ""],
["", "2016"],
Expand Down Expand Up @@ -2368,6 +2395,32 @@
["4171_1", "0.07", "173.9", "58.1%", "1.6%", "2.1%", "0.5%"],
]

# Expected table for the lattice `replace_text` test: it is compared against
# the first table on page 2 of us-030.pdf parsed with
# replace_text={".": ","}, which is why the numeric cells use "," here.
data_lattice_text_replace = [
[
"Cycle \nName",
"KI \n(1/km)",
"Distance \n(mi)",
"Percent Fuel Savings",
"",
"",
"",
],
[
"",
"",
"",
"Improved \nSpeed",
"Decreased \nAccel",
"Eliminate \nStops",
"Decreased \nIdle",
],
["2012_2", "3,30", "1,3", "5,9%", "9,5%", "29,2%", "17,4%"],
["2145_1", "0,68", "11,2", "2,4%", "0,1%", "9,5%", "2,7%"],
["4234_1", "0,59", "58,7", "8,5%", "1,3%", "8,5%", "3,3%"],
["2032_2", "0,17", "57,8", "21,7%", "0,3%", "2,7%", "1,2%"],
["4171_1", "0,07", "173,9", "58,1%", "1,6%", "2,1%", "0,5%"],
]

data_lattice_table_rotated = [
[
"State",
Expand Down
11 changes: 11 additions & 0 deletions tests/test_lattice.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,17 @@ def test_lattice(testdir):
assert_frame_equal(df, tables[0].df)


@skip_on_windows
def test_lattice_text_replace(testdir):
    """Check that `replace_text` is applied to the default-flavor output."""
    pdf_path = os.path.join(
        testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf"
    )
    extracted = camelot.read_pdf(pdf_path, pages="2", replace_text={".": ","})

    # Every "." in the parsed cells is expected to come back as ",".
    expected = pd.DataFrame(data_lattice_text_replace)
    assert_frame_equal(expected, extracted[0].df)


@skip_on_windows
def test_lattice_table_rotated(testdir):
df = pd.DataFrame(data_lattice_table_rotated)
Expand Down
12 changes: 12 additions & 0 deletions tests/test_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,18 @@ def test_stream_strip_text(testdir):
tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n")
assert_frame_equal(df, tables[0].df)

def test_stream_replace_text(testdir):
    """Check that `replace_text` works together with `strip_text` for stream."""
    substitutions = {"€": "$", "20": "33"}
    pdf_path = os.path.join(testdir, "detect_vertical_false.pdf")

    extracted = camelot.read_pdf(
        pdf_path,
        flavor="stream",
        strip_text=" ,\n",
        replace_text=substitutions,
    )

    expected = pd.DataFrame(data_stream_replace_text)
    assert_frame_equal(expected, extracted[0].df)

def test_stream_edge_tol(testdir):
df = pd.DataFrame(data_stream_edge_tol)
Expand Down

0 comments on commit 289c94e

Please sign in to comment.