Skip to content

Commit

Permalink
Regex: allow unicode char classes outside of [..].
Browse files Browse the repository at this point in the history
  • Loading branch information
jgm committed Dec 2, 2023
1 parent 37a2e98 commit be241d7
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 54 deletions.
113 changes: 59 additions & 54 deletions skylighting-core/src/Regex/KDE/Compile.hs
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ pRegexEscapedChar = do
'S' -> return $ MatchChar (not . isSpace)
'w' -> return $ MatchChar isWordChar
'W' -> return $ MatchChar (not . isWordChar)
'p' -> MatchChar <$> pUnicodeCharClass
_ | c >= '0' && c <= '9' ->
return $! MatchCaptured (ord c - ord '0')
| otherwise -> mzero) <|> (MatchChar . (==) <$> pEscaped c)
Expand Down Expand Up @@ -260,65 +261,69 @@ pRegexCharClass = do
c <- getC
(\d -> (\x -> x >= c && x <= d)) <$> (char '-' *> getC) <|>
return (== c)
let getCClass = do -- character class \p{Lo}
_ <- A.string "\\p"
ds <- satisfy (== 123) *> A.takeWhile (/= 125) <* satisfy (== 125)
return $
(case ds of
"Lu" -> (== UppercaseLetter)
"Ll" -> (== LowercaseLetter)
"Lt" -> (== TitlecaseLetter)
"Lm" -> (== ModifierLetter)
"Lo" -> (== OtherLetter)
"L" -> (\c -> c == UppercaseLetter || c == LowercaseLetter ||
c == TitlecaseLetter || c == ModifierLetter ||
c == OtherLetter)
"Mn" -> (== NonSpacingMark)
"Mc" -> (== SpacingCombiningMark)
"Me" -> (== EnclosingMark)
"M" -> (\c -> c == NonSpacingMark || c == SpacingCombiningMark ||
c == EnclosingMark)
"Nd" -> (== DecimalNumber)
"Nl" -> (== LetterNumber)
"No" -> (== OtherNumber)
"N" -> (\c -> c == DecimalNumber || c == LetterNumber ||
c == OtherNumber)
"Pc" -> (== ConnectorPunctuation)
"Pd" -> (== DashPunctuation)
"Ps" -> (== OpenPunctuation)
"Pe" -> (== ClosePunctuation)
"Pi" -> (== InitialQuote)
"Pf" -> (== FinalQuote)
"Po" -> (== OtherPunctuation)
"P" -> (\c -> c == ConnectorPunctuation || c == DashPunctuation ||
c == OpenPunctuation || c == ClosePunctuation ||
c == InitialQuote || c == FinalQuote ||
c == OtherPunctuation)
"Sm" -> (== MathSymbol)
"Sc" -> (== CurrencySymbol)
"Sk" -> (== ModifierSymbol)
"So" -> (== OtherSymbol)
"S" -> (\c -> c == MathSymbol || c == CurrencySymbol ||
c == ModifierSymbol || c == OtherSymbol)
"Zs" -> (== Space)
"Zl" -> (== LineSeparator)
"Zp" -> (== ParagraphSeparator)
"Z" -> (\c -> c == Space || c == LineSeparator ||
c == ParagraphSeparator)
"Cc" -> (== Control)
"Cf" -> (== Format)
"Cs" -> (== Surrogate)
"Co" -> (== PrivateUse)
"Cn" -> (== NotAssigned)
"C" -> (\c -> c == Control || c == Format || c == Surrogate ||
c == PrivateUse || c == NotAssigned)
_ -> (const False)) . generalCategory
brack <- option [] $ [(==']')] <$ char ']'
fs <- many (getEscapedClass <|> getPosixClass <|> getCRange <|> getCClass)
fs <- many (getEscapedClass <|> getPosixClass <|> getCRange
<|> (A.string "\\p" *> pUnicodeCharClass))
_ <- satisfy (== 93) -- ]
let f c = any ($ c) $ brack ++ fs
return $! MatchChar (if negated then (not . f) else f)

-- Parse the braced name of a Unicode property class, e.g. the {Lo} in
-- \p{Lo}.  The leading \p must already have been consumed by the caller.
-- The result is a predicate on characters that tests the character's
-- general category.  A two-letter name selects a single category; a
-- one-letter name (L, M, N, P, S, Z, C) selects every category in that
-- group.  Any other name yields a predicate that never matches.
pUnicodeCharClass :: Parser (Char -> Bool)
pUnicodeCharClass = do
  _ <- satisfy (== 123) -- {
  name <- A.takeWhile (/= 125)
  _ <- satisfy (== 125) -- }
  let wanted =
        case name of
          "L"  -> letterCats
          "Lu" -> [UppercaseLetter]
          "Ll" -> [LowercaseLetter]
          "Lt" -> [TitlecaseLetter]
          "Lm" -> [ModifierLetter]
          "Lo" -> [OtherLetter]
          "M"  -> markCats
          "Mn" -> [NonSpacingMark]
          "Mc" -> [SpacingCombiningMark]
          "Me" -> [EnclosingMark]
          "N"  -> numberCats
          "Nd" -> [DecimalNumber]
          "Nl" -> [LetterNumber]
          "No" -> [OtherNumber]
          "P"  -> punctuationCats
          "Pc" -> [ConnectorPunctuation]
          "Pd" -> [DashPunctuation]
          "Ps" -> [OpenPunctuation]
          "Pe" -> [ClosePunctuation]
          "Pi" -> [InitialQuote]
          "Pf" -> [FinalQuote]
          "Po" -> [OtherPunctuation]
          "S"  -> symbolCats
          "Sm" -> [MathSymbol]
          "Sc" -> [CurrencySymbol]
          "Sk" -> [ModifierSymbol]
          "So" -> [OtherSymbol]
          "Z"  -> separatorCats
          "Zs" -> [Space]
          "Zl" -> [LineSeparator]
          "Zp" -> [ParagraphSeparator]
          "C"  -> otherCats
          "Cc" -> [Control]
          "Cf" -> [Format]
          "Cs" -> [Surrogate]
          "Co" -> [PrivateUse]
          "Cn" -> [NotAssigned]
          _    -> [] -- unrecognized name: the predicate rejects everything
  return $ \ch -> generalCategory ch `elem` wanted
  where
    -- Members of each one-letter group.
    letterCats =
      [ UppercaseLetter, LowercaseLetter, TitlecaseLetter
      , ModifierLetter, OtherLetter ]
    markCats =
      [ NonSpacingMark, SpacingCombiningMark, EnclosingMark ]
    numberCats =
      [ DecimalNumber, LetterNumber, OtherNumber ]
    punctuationCats =
      [ ConnectorPunctuation, DashPunctuation, OpenPunctuation
      , ClosePunctuation, InitialQuote, FinalQuote, OtherPunctuation ]
    symbolCats =
      [ MathSymbol, CurrencySymbol, ModifierSymbol, OtherSymbol ]
    separatorCats =
      [ Space, LineSeparator, ParagraphSeparator ]
    otherCats =
      [ Control, Format, Surrogate, PrivateUse, NotAssigned ]


anyChar :: Parser Char
anyChar = do
w <- satisfy (const True)
Expand Down
2 changes: 2 additions & 0 deletions skylighting-core/test/test-skylighting.hs
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,8 @@ regexTests =
, ("a|\\((?0)\\)", "(((a)))", Just ("(((a)))", []))
, ("([abc](x(?1))*)", "axbxcc", Just ("axbxc", [(1,"axbxc"),(2,"xc")]))
-- note: pcre gives instead (2, "xbxc") -- I don't understand why
, ("[\\p{Nd}]", "33", Just ("3", []))
, ("\\p{N}", "33", Just ("3", []))
]


Expand Down

0 comments on commit be241d7

Please sign in to comment.