diff --git a/bin/xls2csv.hs b/bin/xls2csv.hs index 990f4e4..7ec43b1 100755 --- a/bin/xls2csv.hs +++ b/bin/xls2csv.hs @@ -2,7 +2,7 @@ -- stack --resolver lts runhaskell --package getopt-generics import Data.List (intercalate) -import Data.Xls (decodeXlsIO) +import Data.Xls (decodeXlsIO,cellToString) import WithCli (withCli) -- TODO need to escape the separator and the escaping quotes themselves @@ -10,7 +10,7 @@ import WithCli (withCli) xlsToCSV :: String -> IO () xlsToCSV file = do worksheets <- decodeXlsIO file - mapM_ (mapM_ (putStrLn . intercalate ",")) worksheets + mapM_ (mapM_ (putStrLn . intercalate "," . fmap cellToString)) worksheets main :: IO () main = withCli xlsToCSV diff --git a/lib/Data/Xls.hs b/lib/Data/Xls.hs index 6aeafd6..7c9188f 100644 --- a/lib/Data/Xls.hs +++ b/lib/Data/Xls.hs @@ -20,23 +20,38 @@ #endif module Data.Xls - ( decodeXlsIO + ( -- * Cell data type + CellF(..) + , Cell + , cellToString + -- * decoding Xls files + , decodeXlsIO + , decodeXlsByteString + , decodeXlsByteString' , decodeXls , XlsException(..) + , XLSError(..) ) where -import Control.Exception (Exception, throwIO, bracket) +import Control.Exception (Exception, throwIO, bracket, catch) import Control.Monad.IO.Class -import Control.Monad (when, void) +import Control.Monad (void) import Control.Monad.Trans.Resource import Data.Conduit hiding (Conduit, Sink, Source) import Data.Data import Data.Int -import Data.Maybe (catMaybes, fromJust, isJust, fromMaybe) +import Data.Word (Word32) +import Data.Maybe (catMaybes, fromJust, isJust) +import Data.ByteString (hPut) +import Data.ByteString.Internal (ByteString(..)) +import Data.XlsCell (CellF(..),Cell,cellToString) import Foreign.C import Foreign.Ptr -import Text.Printf +import Foreign.ForeignPtr (withForeignPtr) +import Foreign.Storable (Storable(..)) +import Foreign.Marshal.Alloc (malloc) +import System.IO.Temp (withSystemTempFile) #define CCALL(name,signature) \ foreign import ccall unsafe #name \ @@ -44,9 +59,45 @@ foreign import ccall unsafe #name \ -- Workbook accessor functions data XLSWorkbookStruct +-- | An enum returned by libxls. See libxls\/include\/xls.h +data XLSError = LIBXLS_OK + | LIBXLS_ERROR_OPEN + | LIBXLS_ERROR_SEEK + | LIBXLS_ERROR_READ + | LIBXLS_ERROR_PARSE + | LIBXLS_ERROR_MALLOC + deriving (Show,Eq,Enum) +instance Storable XLSError where + sizeOf = const (sizeOf (0 :: Word32)) -- TODO: sizeof enum is compiler and architecture dependent! + alignment = sizeOf -- okay for simple Storables + peek = fmap (toEnum . (fromIntegral :: Word32 -> Int)) . peek . castPtr + poke ptr e = poke (castPtr ptr) ((fromIntegral :: Int -> Word32).fromEnum $ e) +instance Exception XLSError where + type XLSWorkbook = Ptr XLSWorkbookStruct +type XLSErrorT = Ptr XLSError +type CBuffer = Ptr CUChar + +-- | Recall that +-- @ +-- ByteString ~ (ForeignPtr Char8,Int) +-- CUChar ~ Word8 +-- CSize ~ Word64 +-- @ +-- +-- So we need to marshal +-- +-- @ +-- (ForeignPtr Char8) -> Ptr CUChar +-- Int -> CSize +-- @ +toCBuffer :: ByteString -> IO (CBuffer,CSize) +toCBuffer (PS fPtr offset ilen) = do + withForeignPtr fPtr $ \ptrData -> do + return (plusPtr (castPtr ptrData) offset,CSize (fromIntegral ilen)) CCALL(xls_open, CString -> CString -> IO XLSWorkbook) +CCALL(xls_open_buffer, CBuffer -> CSize -> CString -> XLSErrorT -> IO XLSWorkbook) CCALL(xls_wb_sheetcount, XLSWorkbook -> IO CInt -- Int32) CCALL(xls_close_WB, XLSWorkbook -> IO ()) @@ -82,6 +133,13 @@ data XlsException = instance Exception XlsException +exceptionLeft :: XlsException -> Either XLSError a +exceptionLeft (XlsFileNotFound _) = Left LIBXLS_ERROR_OPEN +exceptionLeft (XlsParseError _) = Left LIBXLS_ERROR_PARSE + +catchXls :: IO a -> IO (Either XLSError a) +catchXls = flip catch (return.exceptionLeft) . fmap Right + -- | Parse a Microsoft excel xls workbook file into a Conduit yielding -- rows in a worksheet. Each row represented by a list of Strings, each String -- representing an individual cell. @@ -111,6 +169,27 @@ decodeXls file = count <- liftIO $ c_xls_wb_sheetcount pWB mapM_ (decodeOneWorkSheet file pWB) [0 .. count - 1] +-- | A work-around via temporary files and 'decodeXlsIO'. +-- Since this library lacks a pure function to decode from a buffer, +-- we just write the buffer to a temporary file and decode the file. +-- Due to Erik Rybakken. +decodeXlsByteString :: ByteString -> IO [[[Cell]]] +decodeXlsByteString content = withSystemTempFile "decodeXlsByteString" + $ \filePath h -> do + hPut h content + decodeXlsIO filePath + +-- | Experimental: This function uses the @xls_open_buffer@ function of libxls. +decodeXlsByteString' :: ByteString -> IO (Either XLSError [[[Cell]]]) +decodeXlsByteString' bs = do + (buf,buflen) <- toCBuffer bs + enc <- newCString "UTF-8" + outError <- malloc + wb <- c_xls_open_buffer buf buflen enc outError + e <- peek outError + case e of + LIBXLS_OK -> decodeXLSWorkbook Nothing wb + _ -> return (Left e) -- | Parse a Microsoft excel xls workbook file into a list of worksheets, each -- worksheet consists of a list of rows and each row consists of a list of @@ -120,17 +199,29 @@ decodeXls file = -- decodeXlsIO :: FilePath - -> IO [[[String]]] + -> IO [[[Cell]]] decodeXlsIO file = do file' <- newCString file pWB <- newCString "UTF-8" >>= c_xls_open file' - when (pWB == nullPtr) $ - throwIO $ XlsFileNotFound - $ "XLS file " ++ file ++ " not found." - count <- liftIO $ c_xls_wb_sheetcount pWB - results <- mapM (decodeOneWorkSheetIO file pWB) [0 .. count - 1] - void $ c_xls_close_WB pWB - return results + parseResult <- decodeXLSWorkbook (Just file) pWB + case parseResult of + Right results -> return results + Left e -> case e of + LIBXLS_ERROR_OPEN -> throwIO $ XlsFileNotFound $ + "XLS file " ++ file ++ " not found." + _ -> throwIO $ XlsParseError $ + "XLS file " ++ file ++ " could not be parsed." + +-- helper function for decoding both file and buffer +decodeXLSWorkbook :: Maybe FilePath -> XLSWorkbook -> IO (Either XLSError [[[Cell]]]) +decodeXLSWorkbook mFile pWB = if pWB == nullPtr + then return (Left LIBXLS_ERROR_OPEN) + else catchXls $ do + count <- liftIO $ c_xls_wb_sheetcount pWB + results <- mapM (decodeOneWorkSheetIO (maybe "buffer" id mFile) pWB) [0 .. count - 1] + void $ c_xls_close_WB pWB + return results + decodeOneWorkSheet :: MonadResource m @@ -155,7 +246,7 @@ decodeOneWorkSheetIO :: FilePath -> XLSWorkbook -> CInt - -> IO [[String]] + -> IO [[Cell]] decodeOneWorkSheetIO file pWB index = bracket alloc cleanup decodeRowsIO where @@ -179,7 +270,7 @@ decodeRows pWS = do decodeRowsIO :: XLSWorksheet - -> IO [[String]] + -> IO [[Cell]] decodeRowsIO pWS = do rows <- c_xls_ws_rowcount pWS cols <- c_xls_ws_colcount pWS @@ -197,22 +288,27 @@ decodeOneRowIO :: XLSWorksheet -> Int16 -> Int16 - -> IO [String] + -> IO [Cell] decodeOneRowIO pWS cols rowindex = mapM (c_xls_cell pWS rowindex) [0 .. cols - 1] - >>= mapM decodeOneCell - >>= pure . (map $ fromMaybe "") + >>= mapM decodeOneCell' data CellType = Numerical | Formula | Str | Other decodeOneCell :: XLSCell -> IO (Maybe String) -decodeOneCell cellPtr = do +decodeOneCell = fmap maybeString . decodeOneCell' where + maybeString (OtherCell _) = Nothing + maybeString c = Just (cellToString c) + +decodeOneCell' :: XLSCell -> IO Cell +decodeOneCell' cellPtr = do nil <- isNullCell cellPtr if nil then - return Nothing - else cellValue cellPtr >>= return . Just + return (OtherCell ()) + else cellValue cellPtr where + emptyCell = OtherCell () isNullCell ptr = if ptr == nullPtr then return True @@ -237,21 +333,19 @@ decodeOneCell cellPtr = do return Nothing return $ case cellType typ ftype strval of - Numerical -> outputNum numval + Numerical -> let (CDouble d) = numval in NumericalCell d Formula -> decodeFormula strval numval - Str -> fromJust strval - Other -> "" -- we don't decode anything else + Str -> (TextCell . fromJust) strval + Other -> emptyCell -- we don't decode anything else decodeFormula str numval = case str of Just "bool" -> outputBool numval - Just "error" -> "*error*" - Just x -> x - Nothing -> "" -- is it possible? + Just "error" -> TextCell "*error*" + Just x -> TextCell x + Nothing -> emptyCell -- is it possible? - outputNum d = printf "%.15g" (uncurry encodeFloat (decodeFloat d) - :: Double) - outputBool d = if d == 0 then "false" else "true" + outputBool d = BoolCell (if d == 0 then False else True) cellType t ftype strval = if t == 0x27e || t == 0x0BD || t == 0x203 then diff --git a/lib/Data/XlsCell.hs b/lib/Data/XlsCell.hs new file mode 100644 index 0000000..d980ef2 --- /dev/null +++ b/lib/Data/XlsCell.hs @@ -0,0 +1,35 @@ +-- | +-- Module : Data.XlsCell +-- Copyright : (c) 2022 Olaf.Klinke +-- +-- License : BSD-style +-- Maintainer : olaf.klinke@phymetric.de +-- Stability : experimental +-- Portability : GHC +-- +-- Static Excel cell values +-- +{-# LANGUAGE DeriveFunctor, FlexibleInstances #-} +module Data.XlsCell (CellF(..),Cell,cellToString) where +import Data.String (IsString(..)) +import Text.Printf (printf) + +-- | extensible 'Cell' type +data CellF o = NumericalCell Double + | TextCell String + | BoolCell Bool + | OtherCell o + deriving (Functor,Show,Eq) +instance IsString (CellF o) where + fromString = TextCell + +-- | static 'Cell's in Excel files can hold +-- numbers, test or booleans. +type Cell = CellF () + +-- | convert to 'String'. Not the inverse of 'fromString'! +cellToString :: Cell -> String +cellToString (NumericalCell d) = printf "%.15g" d +cellToString (TextCell txt) = txt +cellToString (BoolCell b) = if b then "True" else "False" +cellToString (OtherCell _) = "" diff --git a/test/Spec.hs b/test/Spec.hs index b644fb7..5f5ba22 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -7,5 +7,8 @@ main = hspec $ describe "Sanity check" $ do content <- decodeXlsIO "test/data/test.xls" content `shouldBe` testFileContent -testFileContent :: [[[String]]] -testFileContent = [[["1.000000000000000","2.3","text"]],[["1.000000000000000","2.3","text"]]] +testFileContent :: [[[Cell]]] +testFileContent = [ + [[NumericalCell 1.0,TextCell "2.3",TextCell "text"]], + [[NumericalCell 1.0,TextCell "2.3",TextCell "text"]] + ] diff --git a/xls.cabal b/xls.cabal index 0acf162..5208525 100644 --- a/xls.cabal +++ b/xls.cabal @@ -1,5 +1,5 @@ name: xls -version: 0.1.3 +version: 0.1.4 synopsis: Parse Microsoft Excel xls files (BIFF/Excel 97-2004) description: Parse Microsoft Excel spreadsheet files in @.xls@ file format @@ -49,10 +49,13 @@ library hs-source-dirs: lib exposed-modules: Data.Xls + other-modules: Data.XlsCell build-depends: base >= 4.7 && < 5 + , bytestring >= 0.10 , conduit >= 1.1 && < 1.4 , filepath >= 1.0 && < 1.5 , resourcet >= 0.3 && < 1.3 + , temporary , transformers >= 0.1 && < 0.6 c-sources: lib/libxls-wrapper.c,