Regex: allow unicode char classes outside of [..].

jgm · Dec 2, 2023 · be241d7 · be241d7
1 parent 37a2e98
commit be241d7
Show file tree

Hide file tree

Showing 2 changed files with 61 additions and 54 deletions.
diff --git a/skylighting-core/src/Regex/KDE/Compile.hs b/skylighting-core/src/Regex/KDE/Compile.hs
@@ -179,6 +179,7 @@ pRegexEscapedChar = do
     'S' -> return $ MatchChar (not . isSpace)
     'w' -> return $ MatchChar isWordChar
     'W' -> return $ MatchChar (not . isWordChar)
+    'p' -> MatchChar <$> pUnicodeCharClass
     _ | c >= '0' && c <= '9' ->
        return $! MatchCaptured (ord c - ord '0')
       | otherwise -> mzero) <|> (MatchChar . (==) <$> pEscaped c)
@@ -260,65 +261,69 @@ pRegexCharClass = do
         c <- getC
         (\d -> (\x -> x >= c && x <= d)) <$> (char '-' *> getC) <|>
           return (== c)
-  let getCClass = do -- character class \p{Lo}
-        _ <- A.string "\\p"
-        ds <- satisfy (== 123) *> A.takeWhile (/= 125) <* satisfy (== 125)
-        return $
-          (case ds of
-            "Lu" -> (== UppercaseLetter)
-            "Ll" -> (== LowercaseLetter)
-            "Lt" -> (== TitlecaseLetter)
-            "Lm" -> (== ModifierLetter)
-            "Lo" -> (== OtherLetter)
-            "L" -> (\c -> c == UppercaseLetter || c == LowercaseLetter ||
-                          c == TitlecaseLetter || c == ModifierLetter ||
-                          c == OtherLetter)
-            "Mn" -> (== NonSpacingMark)
-            "Mc" -> (== SpacingCombiningMark)
-            "Me" -> (== EnclosingMark)
-            "M" -> (\c -> c == NonSpacingMark || c == SpacingCombiningMark ||
-                          c == EnclosingMark)
-            "Nd" -> (== DecimalNumber)
-            "Nl" -> (== LetterNumber)
-            "No" -> (== OtherNumber)
-            "N" -> (\c -> c == DecimalNumber || c == LetterNumber ||
-                          c == OtherNumber)
-            "Pc" -> (== ConnectorPunctuation)
-            "Pd" -> (== DashPunctuation)
-            "Ps" -> (== OpenPunctuation)
-            "Pe" -> (== ClosePunctuation)
-            "Pi" -> (== InitialQuote)
-            "Pf" -> (== FinalQuote)
-            "Po" -> (== OtherPunctuation)
-            "P" -> (\c -> c == ConnectorPunctuation || c == DashPunctuation ||
-                          c == OpenPunctuation || c == ClosePunctuation ||
-                          c == InitialQuote || c == FinalQuote ||
-                          c == OtherPunctuation)
-            "Sm" -> (== MathSymbol)
-            "Sc" -> (== CurrencySymbol)
-            "Sk" -> (== ModifierSymbol)
-            "So" -> (== OtherSymbol)
-            "S" -> (\c -> c == MathSymbol || c == CurrencySymbol ||
-                          c == ModifierSymbol || c == OtherSymbol)
-            "Zs" -> (== Space)
-            "Zl" -> (== LineSeparator)
-            "Zp" -> (== ParagraphSeparator)
-            "Z" -> (\c -> c == Space || c == LineSeparator ||
-                          c == ParagraphSeparator)
-            "Cc" -> (== Control)
-            "Cf" -> (== Format)
-            "Cs" -> (== Surrogate)
-            "Co" -> (== PrivateUse)
-            "Cn" -> (== NotAssigned)
-            "C" -> (\c -> c == Control || c == Format || c == Surrogate ||
-                          c == PrivateUse || c == NotAssigned)
-            _    -> (const False)) . generalCategory
   brack <- option [] $ [(==']')] <$ char ']'
-  fs <- many (getEscapedClass <|> getPosixClass <|> getCRange <|> getCClass)
+  fs <- many (getEscapedClass <|> getPosixClass <|> getCRange
+              <|> (A.string "\\p" *> pUnicodeCharClass))
   _ <- satisfy (== 93) -- ]
   let f c = any ($ c) $ brack ++ fs
   return $! MatchChar (if negated then (not . f) else f)
 
+-- character class \p{Lo}; we assume \p is already parsed
+pUnicodeCharClass :: Parser (Char -> Bool)
+pUnicodeCharClass = do
+  ds <- satisfy (== 123) *> A.takeWhile (/= 125) <* satisfy (== 125)
+  return $
+    (case ds of
+      "Lu" -> (== UppercaseLetter)
+      "Ll" -> (== LowercaseLetter)
+      "Lt" -> (== TitlecaseLetter)
+      "Lm" -> (== ModifierLetter)
+      "Lo" -> (== OtherLetter)
+      "L" -> (\c -> c == UppercaseLetter || c == LowercaseLetter ||
+                    c == TitlecaseLetter || c == ModifierLetter ||
+                    c == OtherLetter)
+      "Mn" -> (== NonSpacingMark)
+      "Mc" -> (== SpacingCombiningMark)
+      "Me" -> (== EnclosingMark)
+      "M" -> (\c -> c == NonSpacingMark || c == SpacingCombiningMark ||
+                    c == EnclosingMark)
+      "Nd" -> (== DecimalNumber)
+      "Nl" -> (== LetterNumber)
+      "No" -> (== OtherNumber)
+      "N" -> (\c -> c == DecimalNumber || c == LetterNumber ||
+                    c == OtherNumber)
+      "Pc" -> (== ConnectorPunctuation)
+      "Pd" -> (== DashPunctuation)
+      "Ps" -> (== OpenPunctuation)
+      "Pe" -> (== ClosePunctuation)
+      "Pi" -> (== InitialQuote)
+      "Pf" -> (== FinalQuote)
+      "Po" -> (== OtherPunctuation)
+      "P" -> (\c -> c == ConnectorPunctuation || c == DashPunctuation ||
+                    c == OpenPunctuation || c == ClosePunctuation ||
+                    c == InitialQuote || c == FinalQuote ||
+                    c == OtherPunctuation)
+      "Sm" -> (== MathSymbol)
+      "Sc" -> (== CurrencySymbol)
+      "Sk" -> (== ModifierSymbol)
+      "So" -> (== OtherSymbol)
+      "S" -> (\c -> c == MathSymbol || c == CurrencySymbol ||
+                    c == ModifierSymbol || c == OtherSymbol)
+      "Zs" -> (== Space)
+      "Zl" -> (== LineSeparator)
+      "Zp" -> (== ParagraphSeparator)
+      "Z" -> (\c -> c == Space || c == LineSeparator ||
+                    c == ParagraphSeparator)
+      "Cc" -> (== Control)
+      "Cf" -> (== Format)
+      "Cs" -> (== Surrogate)
+      "Co" -> (== PrivateUse)
+      "Cn" -> (== NotAssigned)
+      "C" -> (\c -> c == Control || c == Format || c == Surrogate ||
+                    c == PrivateUse || c == NotAssigned)
+      _    -> (const False)) . generalCategory
+
+
 anyChar :: Parser Char
 anyChar = do
   w <- satisfy (const True)

diff --git a/skylighting-core/test/test-skylighting.hs b/skylighting-core/test/test-skylighting.hs
@@ -343,6 +343,8 @@ regexTests =
   , ("a|\\((?0)\\)", "(((a)))", Just ("(((a)))", []))
   , ("([abc](x(?1))*)", "axbxcc", Just ("axbxc", [(1,"axbxc"),(2,"xc")]))
    -- note: pcre gives instead (2, "xbxc") -- I don't understand why
+  , ("[\\p{Nd}]", "33", Just ("3", []))
+  , ("\\p{N}", "33", Just ("3", []))
   ]