style(whitespaces in text/new lines): Make text prettier (#447)

* WIP * Added comments * fixed test * make know suffixes set * updated snapshots
Hochfrequenz · Sep 23, 2024 · 2b94ffd · 2b94ffd
1 parent 2456f60
commit 2b94ffd
Show file tree

Hide file tree

Showing 3 changed files with 2,561 additions and 2,541 deletions.
diff --git a/src/kohlrahbi/docxtablecells/bodycell.py b/src/kohlrahbi/docxtablecells/bodycell.py
@@ -11,6 +11,17 @@
 from kohlrahbi.table_header import get_tabstop_positions
 
 INDEX_OF_CODES_AND_QUALIFIER_COLUMN = 4
+KNOW_SUFFIXES = {
+    "g",
+    "ung",
+    "gs-",
+    "vall",
+    "n",
+    "m",
+    "t",
+    "rage",
+    "sgrund",
+}  # only a temporary and incomplete list to filter some cases, not intended as NLP
 
 
 class BodyCell(BaseModel):
@@ -38,6 +49,13 @@ def parse(self, ahb_row_dataframe: pd.DataFrame) -> pd.DataFrame:
             tabstop_positions (list[int]): All tabstop positions of the indicator middle cell
         """
 
+        def add_text_to_column(row_index: int, column_index: int, text: str) -> None:
+            starts_with_known_suffix = any(text.startswith(suffix) for suffix in KNOW_SUFFIXES)
+            if len(text) > 0:
+                if len(ahb_row_dataframe.iat[row_index, column_index]) > 0 and not starts_with_known_suffix:
+                    text = " " + text
+                ahb_row_dataframe.iat[row_index, column_index] += text
+
         def handle_code_or_qualifier_entry(
             splitted_text_at_tabs: list[str], row_index: int, is_first_iteration: bool
         ) -> int:
@@ -48,7 +66,7 @@ def handle_code_or_qualifier_entry(
                 if not is_first_iteration:
                     ahb_row_dataframe.loc[ahb_row_dataframe.index.max() + 1, :] = ""
                     row_index += 1
-            ahb_row_dataframe.iat[row_index, INDEX_OF_CODES_AND_QUALIFIER_COLUMN] += splitted_text_at_tabs.pop(0)
+            add_text_to_column(row_index, INDEX_OF_CODES_AND_QUALIFIER_COLUMN, splitted_text_at_tabs.pop(0))
             return row_index
 
         def handle_tab_stops(
@@ -59,14 +77,16 @@ def handle_tab_stops(
                 for indicator_tabstop_position, column_index in zip(self.indicator_tabstop_positions, column_indezes):
                     if len(tab_stops_in_current_paragraph) == 1:
                         if indicator_tabstop_position in (tabstop, paragraph.paragraph_format.left_indent):
-                            ahb_row_dataframe.iat[row_index, column_index] += splitted_text_at_tabs.pop(0)
+                            add_text_to_column(row_index, column_index, splitted_text_at_tabs.pop(0))
                     else:
                         if tabstop == indicator_tabstop_position:
-                            ahb_row_dataframe.iat[row_index, column_index] += splitted_text_at_tabs.pop(0)
+                            add_text_to_column(row_index, column_index, splitted_text_at_tabs.pop(0))
 
         def handle_no_tab_stops(splitted_text_at_tabs: list[str], row_index: int) -> None:
             if splitted_text_at_tabs:
-                ahb_row_dataframe.at[row_index, "Beschreibung"] += splitted_text_at_tabs.pop(0)
+                column_index = ahb_row_dataframe.columns.get_loc("Beschreibung")
+                assert isinstance(column_index, int)
+                add_text_to_column(row_index, column_index, splitted_text_at_tabs.pop(0))
 
         cell_is_empty = self.table_cell.paragraphs[0].text == ""
         if cell_is_empty: