From b08bdc52e750290eee99373571a8427fa63e691b Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Fri, 22 Sep 2023 15:45:53 +1200
Subject: [PATCH] Replace german with german2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The website says about german2:

  In the sample German vocabulary of 35,000 words, the main stemmer and
  the variant stemmer exhibit about 90 differences. Of these about half
  are in words of foreign language origin (raphael, poesie etc). Of the
  native German words, about half seem to be improved by the variant
  stemming, and the other half made worse. In any case the differences
  are little more than one word per thousand among the native German
  words.

I did my own comparison of the output from german and german2 on
snowball-data/german/voc.txt which has 35033 entries (so appears to be
the same "sample German vocabulary of 35,000 words"; also the only
change to this file since it was added to version control in 2005 has
been to convert it to UTF-8).

Comparing the results from stemming this with german and german2, the
first interesting thing is I get 77 different stems (rather than "about
90"). The algorithm has changed a little over time - there was an extra
rule to handle "-nisse" in 2009 and a fix for a bug handling "qu" so that
it matches the algorithm description. I undid these two algorithm
changes and got 76 different stems. Maybe "90" was a typo for "80"? I
don't have a better theory.

I also noticed a significant proportion of foreign words, as well as
some proper nouns. Some cases definitely seem improved, and quite a few
are just different but effectively just change the stem for a word or
group of words to a stem that isn't otherwise generated. Contrary to
that quote from the website however, I didn't spot any differences I
would classify as clearly worse, though there are some changes that have
good and bad aspects to them.

An example is that german2 changes "Bluet" (Alemannic German word for
"blood") to stem to "blut" which is the same stem as "Blut" (German word
for "blood"), so that seems beneficial. The downside is that "Blüte"
("blossom") stems to "blut" with both the german and german2 algorithms,
but this "Blut"/"Blüte" conflation is an already present minor problem
so I think overall I'd view the change to "Bluet" as neutral at worst.

The replacing of umlauts with "e" suffixes is presumably much less
common in newly created text than it once was as modern computer systems
generally don't have the limitations which motivated this, but there
will still be large amounts of legacy text so I think it makes sense to
just replace german with german2.

Fixes #92
---
 GNUmakefile            |   2 +-
 algorithms/german.sbl  |  20 ++++--
 algorithms/german2.sbl | 145 -----------------------------------------
 libstemmer/modules.txt |   3 -
 4 files changed, 14 insertions(+), 156 deletions(-)
 delete mode 100644 algorithms/german2.sbl

diff --git a/GNUmakefile b/GNUmakefile
index 7fcff3d8..5ce3b207 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -65,7 +65,7 @@ tarball_ext = .tar.gz
 # * KOI8_R_algorithms
 include algorithms.mk
 
-other_algorithms = german2 kraaij_pohlmann lovins
+other_algorithms = kraaij_pohlmann lovins
 
 all_algorithms = $(libstemmer_algorithms) $(other_algorithms)
 
diff --git a/algorithms/german.sbl b/algorithms/german.sbl
index 61f24ef9..cd303b15 100644
--- a/algorithms/german.sbl
+++ b/algorithms/german.sbl
@@ -32,16 +32,22 @@ define st_ending s_ending - 'r'
 
 define prelude as (
 
-    test repeat (
-        (
-            ['{ss}'] <- 'ss'
-        ) or next
-    )
-
-    repeat goto (
+    test repeat goto (
         v [('u'] v <- 'U') or
            ('y'] v <- 'Y')
     )
+
+    repeat (
+        [substring] among(
+            '{ss}' (<- 'ss')
+            'ae'   (<- '{a"}')
+            'oe'   (<- '{o"}')
+            'ue'   (<- '{u"}')
+            'qu'   ()
+            ''     (next)
+        )
+    )
+
 )
 
 define mark_regions as (
diff --git a/algorithms/german2.sbl b/algorithms/german2.sbl
deleted file mode 100644
index cd303b15..00000000
--- a/algorithms/german2.sbl
+++ /dev/null
@@ -1,145 +0,0 @@
-
-/*
-    Extra rule for -nisse ending added 11 Dec 2009
-*/
-
-routines (
-           prelude postlude
-           mark_regions
-           R1 R2
-           standard_suffix
-)
-
-externals ( stem )
-
-integers ( p1 p2 x )
-
-groupings ( v s_ending st_ending )
-
-stringescapes {}
-
-/* special characters */
-
-stringdef a"   '{U+00E4}'
-stringdef o"   '{U+00F6}'
-stringdef u"   '{U+00FC}'
-stringdef ss   '{U+00DF}'
-
-define v 'aeiouy{a"}{o"}{u"}'
-
-define s_ending  'bdfghklmnrt'
-define st_ending s_ending - 'r'
-
-define prelude as (
-
-    test repeat goto (
-        v [('u'] v <- 'U') or
-           ('y'] v <- 'Y')
-    )
-
-    repeat (
-        [substring] among(
-            '{ss}' (<- 'ss')
-            'ae'   (<- '{a"}')
-            'oe'   (<- '{o"}')
-            'ue'   (<- '{u"}')
-            'qu'   ()
-            ''     (next)
-        )
-    )
-
-)
-
-define mark_regions as (
-
-    $p1 = limit
-    $p2 = limit
-
-    test(hop 3 setmark x)
-
-    gopast v  gopast non-v  setmark p1
-    try($p1 < x  $p1 = x)  // at least 3
-    gopast v  gopast non-v  setmark p2
-
-)
-
-define postlude as repeat (
-
-    [substring] among(
-        'Y'    (<- 'y')
-        'U'    (<- 'u')
-        '{a"}' (<- 'a')
-        '{o"}' (<- 'o')
-        '{u"}' (<- 'u')
-        ''     (next)
-    )
-
-)
-
-backwardmode (
-
-    define R1 as $p1 <= cursor
-    define R2 as $p2 <= cursor
-
-    define standard_suffix as (
-        do (
-            [substring] R1 among(
-                'em' 'ern' 'er'
-                (   delete
-                )
-                'e' 'en' 'es'
-                (   delete
-                    try (['s'] 'nis' delete)
-                )
-                's'
-                (   s_ending delete
-                )
-            )
-        )
-        do (
-            [substring] R1 among(
-                'en' 'er' 'est'
-                (   delete
-                )
-                'st'
-                (   st_ending hop 3 delete
-                )
-            )
-        )
-        do (
-            [substring] R2 among(
-                'end' 'ung'
-                (   delete
-                    try (['ig'] not 'e' R2 delete)
-                )
-                'ig' 'ik' 'isch'
-                (   not 'e' delete
-                )
-                'lich' 'heit'
-                (   delete
-                    try (
-                        ['er' or 'en'] R1 delete
-                    )
-                )
-                'keit'
-                (   delete
-                    try (
-                        [substring] R2 among(
-                            'lich' 'ig'
-                            (   delete
-                            )
-                        )
-                    )
-                )
-            )
-        )
-    )
-)
-
-define stem as (
-    do prelude
-    do mark_regions
-    backwards
-        do standard_suffix
-    do postlude
-)
diff --git a/libstemmer/modules.txt b/libstemmer/modules.txt
index 61e23417..cd36a219 100644
--- a/libstemmer/modules.txt
+++ b/libstemmer/modules.txt
@@ -52,9 +52,6 @@ porter          UTF_8,ISO_8859_1        porter			english
 # intended for general use, and use of them is is not fully supported.  These
 # algorithms are:
 #
-# german2          - This is a slight modification of the german stemmer.
-#german2          UTF_8,ISO_8859_1        german2		german
-#
 # kraaij_pohlmann  - This is a different dutch stemmer.
 #kraaij_pohlmann  UTF_8,ISO_8859_1        kraaij_pohlmann	dutch
 #