Skip to content

Commit

Permalink
style(whitespaces in text/new lines): Make text prettier (#447)
Browse files Browse the repository at this point in the history
* WIP

* Added comments

* fixed test

* make known suffixes a set

* updated snapshots
  • Loading branch information
DeltaDaniel authored Sep 23, 2024
1 parent 2456f60 commit 2b94ffd
Show file tree
Hide file tree
Showing 3 changed files with 2,561 additions and 2,541 deletions.
28 changes: 24 additions & 4 deletions src/kohlrahbi/docxtablecells/bodycell.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,17 @@
from kohlrahbi.table_header import get_tabstop_positions

# Positional index (used with ``.iat``) of the column that receives code / qualifier text.
INDEX_OF_CODES_AND_QUALIFIER_COLUMN = 4

# Text fragments that are appended to existing cell content WITHOUT a separating blank
# when a new piece of text starts with one of them.
# This is only a temporary and incomplete list to filter some cases; it is not intended as NLP.
KNOW_SUFFIXES = {"g", "m", "n", "t", "gs-", "ung", "rage", "vall", "sgrund"}


class BodyCell(BaseModel):
Expand Down Expand Up @@ -38,6 +49,13 @@ def parse(self, ahb_row_dataframe: pd.DataFrame) -> pd.DataFrame:
tabstop_positions (list[int]): All tabstop positions of the indicator middle cell
"""

def add_text_to_column(row_index: int, column_index: int, text: str) -> None:
    """
    Append ``text`` to the cell at (``row_index``, ``column_index``) of ``ahb_row_dataframe``.

    Empty text leaves the cell untouched. If the cell already holds content, a single
    blank is put in front of the new text, unless the new text starts with one of the
    entries of ``KNOW_SUFFIXES``; in that case it is attached directly.

    Args:
        row_index (int): Positional row index of the target cell
        column_index (int): Positional column index of the target cell
        text (str): The text which should be appended
    """
    if len(text) == 0:
        return

    current_content = ahb_row_dataframe.iat[row_index, column_index]
    # str.startswith accepts a tuple of prefixes, so no explicit loop is required.
    begins_with_known_suffix = text.startswith(tuple(KNOW_SUFFIXES))
    needs_separator = len(current_content) > 0 and not begins_with_known_suffix

    addition = " " + text if needs_separator else text
    ahb_row_dataframe.iat[row_index, column_index] = current_content + addition

def handle_code_or_qualifier_entry(
splitted_text_at_tabs: list[str], row_index: int, is_first_iteration: bool
) -> int:
Expand All @@ -48,7 +66,7 @@ def handle_code_or_qualifier_entry(
if not is_first_iteration:
ahb_row_dataframe.loc[ahb_row_dataframe.index.max() + 1, :] = ""
row_index += 1
ahb_row_dataframe.iat[row_index, INDEX_OF_CODES_AND_QUALIFIER_COLUMN] += splitted_text_at_tabs.pop(0)
add_text_to_column(row_index, INDEX_OF_CODES_AND_QUALIFIER_COLUMN, splitted_text_at_tabs.pop(0))
return row_index

def handle_tab_stops(
Expand All @@ -59,14 +77,16 @@ def handle_tab_stops(
for indicator_tabstop_position, column_index in zip(self.indicator_tabstop_positions, column_indezes):
if len(tab_stops_in_current_paragraph) == 1:
if indicator_tabstop_position in (tabstop, paragraph.paragraph_format.left_indent):
ahb_row_dataframe.iat[row_index, column_index] += splitted_text_at_tabs.pop(0)
add_text_to_column(row_index, column_index, splitted_text_at_tabs.pop(0))
else:
if tabstop == indicator_tabstop_position:
ahb_row_dataframe.iat[row_index, column_index] += splitted_text_at_tabs.pop(0)
add_text_to_column(row_index, column_index, splitted_text_at_tabs.pop(0))

# Appends the next remaining text fragment (if there is one) to the "Beschreibung" column of the given row.
def handle_no_tab_stops(splitted_text_at_tabs: list[str], row_index: int) -> None:
if splitted_text_at_tabs:
# NOTE(review): the next line and the three lines after it each consume one fragment via pop(0).
# In this diff rendering the next line is presumably the removed (pre-change) version - confirm.
ahb_row_dataframe.at[row_index, "Beschreibung"] += splitted_text_at_tabs.pop(0)
# Translate the column label into a positional index, because add_text_to_column addresses cells via .iat.
column_index = ahb_row_dataframe.columns.get_loc("Beschreibung")
# get_loc can also return a slice or a boolean mask; the assert narrows the type to int.
assert isinstance(column_index, int)
add_text_to_column(row_index, column_index, splitted_text_at_tabs.pop(0))

cell_is_empty = self.table_cell.paragraphs[0].text == ""
if cell_is_empty:
Expand Down
Loading

0 comments on commit 2b94ffd

Please sign in to comment.