Skip to content

Commit

Permalink
Merge branch 'romanian_comma_forms'
Browse files (browse the repository at this point in the history)
  • Loading branch information
ojwb committed Aug 7, 2023
2 parents ca6abf3 + 122ee93 commit b849d3a
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 14 deletions.
4 changes: 2 additions & 2 deletions algorithms/catalan.sbl
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ stringescapes {}

stringdef a' '{U+00E1}' // a-acute
stringdef a` '{U+00E0}' // a-grave
stringdef c, '{U+00E7}' // c-cedilla
stringdef cc '{U+00E7}' // c-cedilla
stringdef e' '{U+00E9}' // e-acute
stringdef e` '{U+00E8}' // e-grave
stringdef i' '{U+00ED}' // i-acute
Expand Down Expand Up @@ -90,7 +90,7 @@ backwardmode (
'i{o'}' 'itat' 'itats' 'itzar' 'iva' 'ives' 'ivisme' 'ius'
'fer' 'ment' 'amen' 'ament' 'aments' 'ments' 'ot' 'sfera' 'al' 'als' 'era' 'ana' 'iste'
'aire' 'eria' 'esa' 'eses' 'esos' 'or' '{i'}cia' '{i'}cies' 'icis' 'ici' '{i'}ci' '{i'}cis'
'{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{c,}a' 'nces' '{o'}' 'dor' 'all'
'{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{cc}a' 'nces' '{o'}' 'dor' 'all'
'il' '{i'}stic' 'enc' 'enca' '{i'}s' 'issa' 'issos' '{i'}ssem' '{i'}ssiu' 'issem' 'isseu' '{i'}sseu'
'{o'}s' 'osa' 'dora' 'dores' 'dors' 'adura' 'ble' 'bles' '{i'}vol' '{i'}vola' 'd{i'}s' 'egar' 'ejar' 'ificar'
'itar' 'ables' 'adors' 'idores' 'idors'
Expand Down
4 changes: 2 additions & 2 deletions algorithms/french.sbl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ stringescapes {}

stringdef a^ '{U+00E2}' // a-circumflex
stringdef a` '{U+00E0}' // a-grave
stringdef c, '{U+00E7}' // c-cedilla
stringdef cc '{U+00E7}' // c-cedilla

stringdef e" '{U+00EB}' // e-diaeresis (rare)
stringdef e' '{U+00E9}' // e-acute
Expand Down Expand Up @@ -238,7 +238,7 @@ define stem as (
)
and
try( [ ('Y' ] <- 'i' ) or
('{c,}'] <- 'c' )
('{cc}'] <- 'c' )
)
) or
residual_suffix
Expand Down
10 changes: 5 additions & 5 deletions algorithms/portuguese.sbl
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ stringdef i' '{U+00ED}' // i-acute
stringdef o^ '{U+00F4}' // o-circumflex
stringdef o' '{U+00F3}' // o-acute
stringdef u' '{U+00FA}' // u-acute
stringdef c, '{U+00E7}' // c-cedilla
stringdef cc '{U+00E7}' // c-cedilla

stringdef a~ '{U+00E3}' // a-tilde
stringdef o~ '{U+00F5}' // o-tilde
Expand Down Expand Up @@ -86,8 +86,8 @@ backwardmode (
'amento' 'amentos'
'imento' 'imentos'

'adora' 'ador' 'a{c,}a~o'
'adoras' 'adores' 'a{c,}o~es' // no -ic test
'adora' 'ador' 'a{cc}a~o'
'adoras' 'adores' 'a{cc}o~es' // no -ic test
'ante' 'antes' '{a^}ncia' // Note 1
(
R2 delete
Expand All @@ -97,7 +97,7 @@ backwardmode (
(
R2 <- 'log'
)
'u{c,}a~o' 'u{c,}o~es'
'u{cc}a~o' 'u{cc}o~es'
(
R2 <- 'u'
)
Expand Down Expand Up @@ -193,7 +193,7 @@ backwardmode (
'e' '{e'}' '{e^}'
( RV delete [('u'] test 'g') or
('i'] test 'c') RV delete )
'{c,}' (<-'c')
'{cc}' (<-'c')
)
)
)
Expand Down
18 changes: 16 additions & 2 deletions algorithms/romanian.sbl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@

routines (
norm
prelude postlude mark_regions
RV R1 R2
step_0
Expand All @@ -23,11 +24,23 @@ stringescapes {}
stringdef a^ '{U+00E2}' // a circumflex
stringdef i^ '{U+00EE}' // i circumflex
stringdef a+ '{U+0103}' // a breve
stringdef s, '{U+015F}' // s cedilla
stringdef t, '{U+0163}' // t cedilla
stringdef sc '{U+015F}' // s cedilla
stringdef tc '{U+0163}' // t cedilla
stringdef s, '{U+0219}' // s comma
stringdef t, '{U+021B}' // t comma

define v 'aeiou{a^}{i^}{a+}'

// Normalize old cedilla forms to correct comma-below forms.
//
// Rewrites each s-cedilla (U+015F, {sc}) as s-comma (U+0219, {s,}) and each
// t-cedilla (U+0163, {tc}) as t-comma (U+021B, {t,}).  `stem` runs this
// routine first, before `prelude` and `mark_regions`.
// NOTE(review): presumably the later steps only match the comma-below
// spellings via {s,} / {t,} - confirm against the suffix tables.
define norm as (
// `do` wraps the loop so that running out of matches does not make the
// routine signal failure; `repeat goto` keeps searching forward for the next
// match until none can be found.
do repeat goto (
// Mark whichever entry matches at the cursor as the slice, then run that
// entry's replacement.
[substring] among (
'{sc}' (<- '{s,}') // s cedilla -> s comma
'{tc}' (<- '{t,}') // t cedilla -> t comma
)
)
)

define prelude as (
repeat goto (
v [ ('u' ] v <- 'U') or
Expand Down Expand Up @@ -223,6 +236,7 @@ backwardmode (
)

define stem as (
do norm
do prelude
do mark_regions
backwards (
Expand Down
4 changes: 2 additions & 2 deletions algorithms/turkish.sbl
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ routines (
stringescapes { }

/* Special characters in Unicode Latin-1 and Latin Extended-A */
stringdef c, '{U+00E7}' // LATIN SMALL LETTER C WITH CEDILLA
stringdef cc '{U+00E7}' // LATIN SMALL LETTER C WITH CEDILLA
stringdef g~ '{U+011F}' // LATIN SMALL LETTER G WITH BREVE
stringdef i' '{U+0131}' // LATIN SMALL LETTER I WITHOUT DOT
stringdef o" '{U+00F6}' // LATIN SMALL LETTER O WITH DIAERESIS
Expand Down Expand Up @@ -413,7 +413,7 @@ backwardmode (
define post_process_last_consonants as (
[substring] among (
'b' (<- 'p')
'c' (<- '{c,}')
'c' (<- '{cc}')
'd' (<- 't')
'{g~}' (<- 'k')
)
Expand Down
2 changes: 1 addition & 1 deletion libstemmer/modules.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ lithuanian UTF_8 lithuanian,lt,lit
nepali UTF_8 nepali,ne,nep
norwegian UTF_8,ISO_8859_1 norwegian,no,nor
portuguese UTF_8,ISO_8859_1 portuguese,pt,por
romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron
romanian UTF_8 romanian,ro,rum,ron
russian UTF_8,KOI8_R russian,ru,rus
serbian UTF_8 serbian,sr,srp
spanish UTF_8,ISO_8859_1 spanish,es,esl,spa
Expand Down

0 comments on commit b849d3a

Please sign in to comment.