diff --git a/skylighting-core/src/Regex/KDE/Compile.hs b/skylighting-core/src/Regex/KDE/Compile.hs index f1ea286d..8cab12ab 100644 --- a/skylighting-core/src/Regex/KDE/Compile.hs +++ b/skylighting-core/src/Regex/KDE/Compile.hs @@ -179,6 +179,7 @@ pRegexEscapedChar = do 'S' -> return $ MatchChar (not . isSpace) 'w' -> return $ MatchChar isWordChar 'W' -> return $ MatchChar (not . isWordChar) + 'p' -> MatchChar <$> pUnicodeCharClass _ | c >= '0' && c <= '9' -> return $! MatchCaptured (ord c - ord '0') | otherwise -> mzero) <|> (MatchChar . (==) <$> pEscaped c) @@ -260,65 +261,69 @@ pRegexCharClass = do c <- getC (\d -> (\x -> x >= c && x <= d)) <$> (char '-' *> getC) <|> return (== c) - let getCClass = do -- character class \p{Lo} - _ <- A.string "\\p" - ds <- satisfy (== 123) *> A.takeWhile (/= 125) <* satisfy (== 125) - return $ - (case ds of - "Lu" -> (== UppercaseLetter) - "Ll" -> (== LowercaseLetter) - "Lt" -> (== TitlecaseLetter) - "Lm" -> (== ModifierLetter) - "Lo" -> (== OtherLetter) - "L" -> (\c -> c == UppercaseLetter || c == LowercaseLetter || - c == TitlecaseLetter || c == ModifierLetter || - c == OtherLetter) - "Mn" -> (== NonSpacingMark) - "Mc" -> (== SpacingCombiningMark) - "Me" -> (== EnclosingMark) - "M" -> (\c -> c == NonSpacingMark || c == SpacingCombiningMark || - c == EnclosingMark) - "Nd" -> (== DecimalNumber) - "Nl" -> (== LetterNumber) - "No" -> (== OtherNumber) - "N" -> (\c -> c == DecimalNumber || c == LetterNumber || - c == OtherNumber) - "Pc" -> (== ConnectorPunctuation) - "Pd" -> (== DashPunctuation) - "Ps" -> (== OpenPunctuation) - "Pe" -> (== ClosePunctuation) - "Pi" -> (== InitialQuote) - "Pf" -> (== FinalQuote) - "Po" -> (== OtherPunctuation) - "P" -> (\c -> c == ConnectorPunctuation || c == DashPunctuation || - c == OpenPunctuation || c == ClosePunctuation || - c == InitialQuote || c == FinalQuote || - c == OtherPunctuation) - "Sm" -> (== MathSymbol) - "Sc" -> (== CurrencySymbol) - "Sk" -> (== ModifierSymbol) - "So" -> (== 
OtherSymbol) - "S" -> (\c -> c == MathSymbol || c == CurrencySymbol || - c == ModifierSymbol || c == OtherSymbol) - "Zs" -> (== Space) - "Zl" -> (== LineSeparator) - "Zp" -> (== ParagraphSeparator) - "Z" -> (\c -> c == Space || c == LineSeparator || - c == ParagraphSeparator) - "Cc" -> (== Control) - "Cf" -> (== Format) - "Cs" -> (== Surrogate) - "Co" -> (== PrivateUse) - "Cn" -> (== NotAssigned) - "C" -> (\c -> c == Control || c == Format || c == Surrogate || - c == PrivateUse || c == NotAssigned) - _ -> (const False)) . generalCategory brack <- option [] $ [(==']')] <$ char ']' - fs <- many (getEscapedClass <|> getPosixClass <|> getCRange <|> getCClass) + fs <- many (getEscapedClass <|> getPosixClass <|> getCRange + <|> (A.string "\\p" *> pUnicodeCharClass)) _ <- satisfy (== 93) -- ] let f c = any ($ c) $ brack ++ fs return $! MatchChar (if negated then (not . f) else f) +-- Parses the braced name of a Unicode general category class such as \p{Lo}; the caller must already have consumed the leading \p. A name that is not listed below yields a predicate that matches no character. +pUnicodeCharClass :: Parser (Char -> Bool) +pUnicodeCharClass = do + ds <- satisfy (== 123) *> A.takeWhile (/= 125) <* satisfy (== 125) + return $ + (case ds of + "Lu" -> (== UppercaseLetter) + "Ll" -> (== LowercaseLetter) + "Lt" -> (== TitlecaseLetter) + "Lm" -> (== ModifierLetter) + "Lo" -> (== OtherLetter) + "L" -> (\c -> c == UppercaseLetter || c == LowercaseLetter || + c == TitlecaseLetter || c == ModifierLetter || + c == OtherLetter) + "Mn" -> (== NonSpacingMark) + "Mc" -> (== SpacingCombiningMark) + "Me" -> (== EnclosingMark) + "M" -> (\c -> c == NonSpacingMark || c == SpacingCombiningMark || + c == EnclosingMark) + "Nd" -> (== DecimalNumber) + "Nl" -> (== LetterNumber) + "No" -> (== OtherNumber) + "N" -> (\c -> c == DecimalNumber || c == LetterNumber || + c == OtherNumber) + "Pc" -> (== ConnectorPunctuation) + "Pd" -> (== DashPunctuation) + "Ps" -> (== OpenPunctuation) + "Pe" -> (== ClosePunctuation) + "Pi" -> (== InitialQuote) + "Pf" -> (== FinalQuote) + "Po" -> (== OtherPunctuation) + "P" -> (\c -> c == ConnectorPunctuation || 
c == DashPunctuation || + c == OpenPunctuation || c == ClosePunctuation || + c == InitialQuote || c == FinalQuote || + c == OtherPunctuation) + "Sm" -> (== MathSymbol) + "Sc" -> (== CurrencySymbol) + "Sk" -> (== ModifierSymbol) + "So" -> (== OtherSymbol) + "S" -> (\c -> c == MathSymbol || c == CurrencySymbol || + c == ModifierSymbol || c == OtherSymbol) + "Zs" -> (== Space) + "Zl" -> (== LineSeparator) + "Zp" -> (== ParagraphSeparator) + "Z" -> (\c -> c == Space || c == LineSeparator || + c == ParagraphSeparator) + "Cc" -> (== Control) + "Cf" -> (== Format) + "Cs" -> (== Surrogate) + "Co" -> (== PrivateUse) + "Cn" -> (== NotAssigned) + "C" -> (\c -> c == Control || c == Format || c == Surrogate || + c == PrivateUse || c == NotAssigned) + _ -> (const False)) . generalCategory + + anyChar :: Parser Char anyChar = do w <- satisfy (const True) diff --git a/skylighting-core/test/test-skylighting.hs b/skylighting-core/test/test-skylighting.hs index 150cca62..ccfe643d 100644 --- a/skylighting-core/test/test-skylighting.hs +++ b/skylighting-core/test/test-skylighting.hs @@ -343,6 +343,8 @@ regexTests = , ("a|\\((?0)\\)", "(((a)))", Just ("(((a)))", [])) , ("([abc](x(?1))*)", "axbxcc", Just ("axbxc", [(1,"axbxc"),(2,"xc")])) -- note: pcre gives insetad (2, "xbxc") -- I don't understand why + , ("[\\p{Nd}]", "33", Just ("3", [])) + , ("\\p{N}", "33", Just ("3", [])) ]