From b08bdc52e750290eee99373571a8427fa63e691b Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Fri, 22 Sep 2023 15:45:53 +1200 Subject: [PATCH] Replace german with german2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The website says about german2: In the sample German vocabulary of 35,000 words, the main stemmer and the variant stemmer exhibit about 90 differences. Of these about half are in words of foreign language origin (raphael, poesie etc). Of the native German words, about half seem to be improved by the variant stemming, and the other half made worse. In any case the differences are little more than one word per thousand among the native German words. I did my own comparison of the output from german and german2 on snowball-data/german/voc.txt which has 35033 entries (so appears to be the same "sample German vocabulary of 35,000 words"; also the only change to this file since it was added to version control in 2005 has been to convert it to UTF-8). Comparing the results from stemming this with german and german2, the first interesting thing is I get 77 different stems (rather than "about 90"). The algorithm has changed a little over time - there was an extra rule to handle "-nisse" in 2009 and a fix for a bug handling "qu" so that it matches the algorithm description. I undid these two algorithm changes and got 76 different stems. Maybe "90" was a typo for "80"? I don't have a better theory. I also noticed a significant proportion of foreign words, as well as some proper nouns. Some cases definitely seem improved, and quite a few are just different but effectively just change the stem for a word or group of words to a stem that isn't otherwise generated. Contrary to that quote from the website however, I didn't spot any differences I would classify as clearly worse, though there are some changes that have good and bad aspects to them. 
An example is that german2 changes "Bluet" (Alemannic German word for "blood") to stem to "blut" which is the same stem as "Blut" (German word for "blood"), so that seems beneficial. The downside is that "Blüte" ("blossom") stems to "blut" with both the german and german2 algorithms, but this "Blut"/"Blüte" conflation is an already present minor problem so I think overall I'd view the change to "Bluet" as neutral at worst. The replacing of umlauts with "e" suffixes is presumably much less common in newly created text than it once was as modern computer systems generally don't have the limitations which motivated this, but there will still be large amounts of legacy text so I think it makes sense to just replace german with german2. Fixes #92 --- GNUmakefile | 2 +- algorithms/german.sbl | 20 ++++-- algorithms/german2.sbl | 145 ----------------------------------------- libstemmer/modules.txt | 3 - 4 files changed, 14 insertions(+), 156 deletions(-) delete mode 100644 algorithms/german2.sbl diff --git a/GNUmakefile b/GNUmakefile index 7fcff3d8..5ce3b207 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -65,7 +65,7 @@ tarball_ext = .tar.gz # * KOI8_R_algorithms include algorithms.mk -other_algorithms = german2 kraaij_pohlmann lovins +other_algorithms = kraaij_pohlmann lovins all_algorithms = $(libstemmer_algorithms) $(other_algorithms) diff --git a/algorithms/german.sbl b/algorithms/german.sbl index 61f24ef9..cd303b15 100644 --- a/algorithms/german.sbl +++ b/algorithms/german.sbl @@ -32,16 +32,22 @@ define st_ending s_ending - 'r' define prelude as ( - test repeat ( - ( - ['{ss}'] <- 'ss' - ) or next - ) - - repeat goto ( + test repeat goto ( v [('u'] v <- 'U') or ('y'] v <- 'Y') ) + + repeat ( + [substring] among( + '{ss}' (<- 'ss') + 'ae' (<- '{a"}') + 'oe' (<- '{o"}') + 'ue' (<- '{u"}') + 'qu' () + '' (next) + ) + ) + ) define mark_regions as ( diff --git a/algorithms/german2.sbl b/algorithms/german2.sbl deleted file mode 100644 index cd303b15..00000000 --- 
a/algorithms/german2.sbl +++ /dev/null @@ -1,145 +0,0 @@ - -/* - Extra rule for -nisse ending added 11 Dec 2009 -*/ - -routines ( - prelude postlude - mark_regions - R1 R2 - standard_suffix -) - -externals ( stem ) - -integers ( p1 p2 x ) - -groupings ( v s_ending st_ending ) - -stringescapes {} - -/* special characters */ - -stringdef a" '{U+00E4}' -stringdef o" '{U+00F6}' -stringdef u" '{U+00FC}' -stringdef ss '{U+00DF}' - -define v 'aeiouy{a"}{o"}{u"}' - -define s_ending 'bdfghklmnrt' -define st_ending s_ending - 'r' - -define prelude as ( - - test repeat goto ( - v [('u'] v <- 'U') or - ('y'] v <- 'Y') - ) - - repeat ( - [substring] among( - '{ss}' (<- 'ss') - 'ae' (<- '{a"}') - 'oe' (<- '{o"}') - 'ue' (<- '{u"}') - 'qu' () - '' (next) - ) - ) - -) - -define mark_regions as ( - - $p1 = limit - $p2 = limit - - test(hop 3 setmark x) - - gopast v gopast non-v setmark p1 - try($p1 < x $p1 = x) // at least 3 - gopast v gopast non-v setmark p2 - -) - -define postlude as repeat ( - - [substring] among( - 'Y' (<- 'y') - 'U' (<- 'u') - '{a"}' (<- 'a') - '{o"}' (<- 'o') - '{u"}' (<- 'u') - '' (next) - ) - -) - -backwardmode ( - - define R1 as $p1 <= cursor - define R2 as $p2 <= cursor - - define standard_suffix as ( - do ( - [substring] R1 among( - 'em' 'ern' 'er' - ( delete - ) - 'e' 'en' 'es' - ( delete - try (['s'] 'nis' delete) - ) - 's' - ( s_ending delete - ) - ) - ) - do ( - [substring] R1 among( - 'en' 'er' 'est' - ( delete - ) - 'st' - ( st_ending hop 3 delete - ) - ) - ) - do ( - [substring] R2 among( - 'end' 'ung' - ( delete - try (['ig'] not 'e' R2 delete) - ) - 'ig' 'ik' 'isch' - ( not 'e' delete - ) - 'lich' 'heit' - ( delete - try ( - ['er' or 'en'] R1 delete - ) - ) - 'keit' - ( delete - try ( - [substring] R2 among( - 'lich' 'ig' - ( delete - ) - ) - ) - ) - ) - ) - ) -) - -define stem as ( - do prelude - do mark_regions - backwards - do standard_suffix - do postlude -) diff --git a/libstemmer/modules.txt b/libstemmer/modules.txt index 61e23417..cd36a219 
100644 --- a/libstemmer/modules.txt +++ b/libstemmer/modules.txt @@ -52,9 +52,6 @@ porter UTF_8,ISO_8859_1 porter english # intended for general use, and use of them is is not fully supported. These # algorithms are: # -# german2 - This is a slight modification of the german stemmer. -#german2 UTF_8,ISO_8859_1 german2 german -# # kraaij_pohlmann - This is a different dutch stemmer. #kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch #