Skip to content

Commit

Permalink
Make WordDetect sensitive to weakDeliminator attribute.
Browse files Browse the repository at this point in the history
Closes #174. This fixes parsing of C floats beginning with "0."
  • Loading branch information
jgm committed Aug 26, 2023
1 parent a5887dc commit b3e66cf
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 15 deletions.
2 changes: 2 additions & 0 deletions skylighting-core/src/Skylighting/Parser.hs
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ getParser casesensitive syntaxname itemdatas lists kwattr cattr el = do
let str' = getAttrValue "String" el
let insensitive = vBool (not casesensitive) $ getAttrValue "insensitive" el
let includeAttrib = vBool False $ getAttrValue "includeAttrib" el
let weakDelim = Set.fromList $ T.unpack $ getAttrValue "weakDeliminator" el
let lookahead = vBool False $ getAttrValue "lookAhead" el
let firstNonSpace = vBool False $ getAttrValue "firstNonSpace" el
let column' = getAttrValue "column" el
Expand Down Expand Up @@ -277,6 +278,7 @@ getParser casesensitive syntaxname itemdatas lists kwattr cattr el = do
then M.lookup cattr itemdatas
else M.lookup attribute itemdatas
, rIncludeAttribute = includeAttrib
, rWeakDeliminators = weakDelim
, rDynamic = dynamic
, rCaseSensitive = not insensitive
, rChildren = children
Expand Down
33 changes: 18 additions & 15 deletions skylighting-core/src/Skylighting/Tokenizer.hs
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,8 @@ tryRule rule inp = do
stringDetect (rDynamic rule) (rCaseSensitive rule)
s inp
WordDetect s -> withAttr attr $
wordDetect (rCaseSensitive rule) s inp
wordDetect (rCaseSensitive rule)
(rWeakDeliminators rule) s inp
LineContinue -> withAttr attr $ lineContinue inp
DetectSpaces -> withAttr attr $ detectSpaces inp
DetectIdentifier -> withAttr attr $ detectIdentifier inp
Expand Down Expand Up @@ -380,9 +381,9 @@ withAttr tt p = do
then return Nothing
else return $ Just (tt, res)

wordDetect :: Bool -> Text -> ByteString -> TokenizerM Text
wordDetect caseSensitive s inp = do
wordBoundary inp
wordDetect :: Bool -> Set.Set Char -> Text -> ByteString -> TokenizerM Text
wordDetect caseSensitive weakDelims s inp = do
wordBoundary weakDelims inp
t <- decodeBS $ UTF8.take (Text.length s) inp
-- we assume here that the case fold will not change length,
-- which is safe for ASCII keywords and the like...
Expand All @@ -395,7 +396,7 @@ wordDetect caseSensitive s inp = do
let d = case UTF8.uncons rest of
Nothing -> '\n'
Just (x,_) -> x
guard $ isWordBoundary c d
guard $ isWordBoundary weakDelims c d
takeChars (Text.length t)

stringDetect :: Bool -> Bool -> Text -> ByteString -> TokenizerM Text
Expand Down Expand Up @@ -549,7 +550,7 @@ regExpr :: Bool -> RE -> ByteString -> TokenizerM Text
regExpr dynamic re inp = do
-- return $! traceShowId $! (reStr, inp)
let reStr = reString re
when (BS.take 2 reStr == "\\b") $ wordBoundary inp
when (BS.take 2 reStr == "\\b") $ wordBoundary mempty inp
regex <- case compileRE re of
Right r -> return r
Left e -> throwError $
Expand All @@ -569,16 +570,18 @@ regExpr dynamic re inp = do
-- | Cut a slice out of a 'ByteString'.  The pair gives the offset at
-- which the slice starts and the number of bytes it should contain.
toSlice :: ByteString -> (Int, Int) -> ByteString
toSlice source (start, count) = BS.take count (BS.drop start source)

-- | Succeed only when the current position is a word boundary.
--
-- The previously consumed character (the tokenizer state's 'prevChar')
-- is compared with the first character of the remaining input using
-- 'isWordBoundary'; the check fails via 'guard' when they are on the
-- same side.  Characters in @weakDelims@ are passed through to
-- 'isWordBoundary'.  Callers that have no weak delimiters pass 'mempty'.
--
-- When the remaining input is empty there is nothing to compare
-- against, so the check succeeds unconditionally.
wordBoundary :: Set.Set Char -> ByteString -> TokenizerM ()
wordBoundary weakDelims inp =
  case UTF8.uncons inp of
    Nothing -> return ()
    Just (d, _) -> do
      c <- gets prevChar
      guard $ isWordBoundary weakDelims c d

-- | Decide whether there is a word boundary between two adjacent
-- characters: @c@ (the earlier one) and @d@ (the later one).
--
-- A character counts as "inside a word" when 'isWordChar' accepts it or
-- when it is a member of @weakDelims@.  There is a boundary exactly when
-- one of the two characters is inside a word and the other is not.
isWordBoundary :: Set.Set Char -> Char -> Char -> Bool
isWordBoundary weakDelims c d = insideWord c /= insideWord d
  where
    -- Weak delimiters are treated like ordinary word characters.
    insideWord x = isWordChar x || x `Set.member` weakDelims

decodeBS :: ByteString -> TokenizerM Text
decodeBS bs = case decodeUtf8' bs of
Expand Down Expand Up @@ -658,7 +661,7 @@ pCChar = do

-- | Match an integer literal at the start of the input.
--
-- The match must begin at a word boundary (no weak delimiters apply, so
-- 'mempty' is passed).  The literal is recognised by trying 'pHex',
-- then 'pOct', then 'pDec'; if none of them succeeds the rule fails
-- with 'mzero'.
parseInt :: ByteString -> TokenizerM Text
parseInt inp = do
  wordBoundary mempty inp
  case A.parseOnly (A.match (pHex <|> pOct <|> pDec)) inp of
    Left _ -> mzero
    -- The byte length of the match is used as a character count.
    Right (r, _) -> takeChars (BS.length r) -- assumes ascii
Expand All @@ -670,7 +673,7 @@ pDec = do

-- | Match an octal literal at the start of the input.
--
-- The match must begin at a word boundary (no weak delimiters apply, so
-- 'mempty' is passed).  The literal is recognised with 'pOct'; if that
-- parser does not succeed the rule fails with 'mzero'.
parseOct :: ByteString -> TokenizerM Text
parseOct inp = do
  wordBoundary mempty inp
  -- Use the octal parser here; running 'pHex' would make this rule
  -- behave exactly like 'parseHex'.
  case A.parseOnly (A.match pOct) inp of
    Left _ -> mzero
    -- The byte length of the match is used as a character count.
    Right (r, _) -> takeChars (BS.length r) -- assumes ascii
Expand All @@ -685,7 +688,7 @@ pOct = do

-- | Match a hexadecimal literal at the start of the input.
--
-- The match must begin at a word boundary (no weak delimiters apply, so
-- 'mempty' is passed).  The literal is recognised with 'pHex'; if that
-- parser does not succeed the rule fails with 'mzero'.
parseHex :: ByteString -> TokenizerM Text
parseHex inp = do
  wordBoundary mempty inp
  case A.parseOnly (A.match pHex) inp of
    Left _ -> mzero
    -- The byte length of the match is used as a character count.
    Right (r, _) -> takeChars (BS.length r) -- assumes ascii
Expand All @@ -706,7 +709,7 @@ mbPlusMinus = () <$ A.satisfy (A.inClass "+-") <|> return ()

-- | Match a floating-point literal at the start of the input.
--
-- The match must begin at a word boundary (no weak delimiters apply, so
-- 'mempty' is passed).  The literal is recognised with 'pFloat'; if
-- that parser does not succeed the rule fails with 'mzero'.
parseFloat :: ByteString -> TokenizerM Text
parseFloat inp = do
  wordBoundary mempty inp
  case A.parseOnly (A.match pFloat) inp of
    Left _ -> mzero
    -- The byte length of the match is used as a character count.
    Right (r, _) -> takeChars (BS.length r) -- assumes all ascii
Expand Down
1 change: 1 addition & 0 deletions skylighting-core/src/Skylighting/Types.hs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ data Rule = Rule{
rMatcher :: !Matcher
, rAttribute :: !TokenType
, rIncludeAttribute :: !Bool
, rWeakDeliminators :: Set.Set Char
, rDynamic :: !Bool
, rCaseSensitive :: !Bool
, rChildren :: ![Rule]
Expand Down
10 changes: 10 additions & 0 deletions skylighting-core/test/test-skylighting.hs
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,16 @@ main = do
@=? tokenize defConfig bash
"f() {\n echo > f\n}\n"

, testCase "C floating-point literal (#174)" $ Right
[ [ ( DataTypeTok , "double")
, ( NormalTok , " x " )
, ( OperatorTok , "=" )
, ( NormalTok , " " )
, ( FloatTok , "0.5")
, ( OperatorTok , ";" ) ] ]
@=? tokenize defConfig c
"double x = 0.5;\n"

]
]

Expand Down

0 comments on commit b3e66cf

Please sign in to comment.