{-# LANGUAGE CPP #-}
-----------------------------------------------------------------------------
-- |
-- Copyright   :  (c) 2006-2007 Duncan Coutts
-- License     :  BSD-style
--
-- Maintainer  :  duncan@haskell.org
-- Portability :  portable (H98 + FFI)
--
-- String encoding conversion
--
-----------------------------------------------------------------------------
module Codec.Text.IConv (

  -- | This module provides pure functions for converting the string encoding
  -- of strings represented by lazy 'ByteString's. This makes it easy to use
  -- either in memory or with disk or network IO.
  --
  -- For example, a simple Latin1 to UTF-8 conversion program is just:
  --
  -- > import Codec.Text.IConv as IConv
  -- > import Data.ByteString.Lazy as ByteString
  -- >
  -- > main = ByteString.interact (convert "LATIN1" "UTF-8")
  --
  -- Or you could lazily read in and convert a UTF-8 file to UTF-32 using:
  --
  -- > content <- fmap (IConv.convert "UTF-8" "UTF-32") (readFile file)
  --
  -- This module uses the POSIX @iconv()@ library function. The primary
  -- advantage of using iconv is that it is widely available, most systems
  -- have a wide range of supported string encodings and the conversion speed
  -- it typically good. The iconv library is available on all unix systems
  -- (since it is required by the POSIX.1 standard) and GNU libiconv is
  -- available as a standalone library for other systems, including Windows.

  -- * Simple conversion API
  convert,
  EncodingName,

  -- * Variant that is lax about conversion errors
  convertFuzzy,
  Fuzzy(..),

  -- * Variants that are pedantic about conversion errors
  convertStrictly,
  convertLazily,
  ConversionError(..),
  reportConversionError,
  Span(..),

  ) where

import Prelude hiding (length, span)

import Control.Exception (assert)
import qualified Control.Exception as Exception
import Foreign.C.Error as C.Error (Errno, errnoToIOError)

import qualified Data.ByteString.Lazy as L (ByteString, toChunks, fromChunks)
import qualified Data.ByteString.Lazy.Internal as L (defaultChunkSize)
import qualified Data.ByteString as S

import qualified Codec.Text.IConv.Internal as IConv
import Codec.Text.IConv.Internal (IConv)


-- | A string encoding name, eg @\"UTF-8\"@ or @\"LATIN1\"@.
--
-- The range of string encodings available is determined by the capabilities
-- of the underlying iconv implementation.
--
-- When using the GNU C or libiconv libraries, the permitted values are listed
-- by the @iconv --list@ command, and all combinations of the listed values
-- are supported.
--
type EncodingName = String

-- | Output spans from encoding conversion. When nothing goes wrong we
-- expect just a bunch of 'Span's. If there are conversion errors we get other
-- span types.
--
data Span =

    -- | An ordinary output span in the target encoding
    Span !S.ByteString

    -- | An error in the conversion process. If this occurs it will be the
    -- last span.
  | ConversionError !ConversionError

data ConversionError =
    -- | The conversion from the input to output string encoding is not
    -- supported by the underlying iconv implementation. This is usually
    -- because a named encoding is not recognised or support for it
    -- was not enabled on this system.
    --
    -- The POSIX standard does not guarantee that all possible combinations
    -- of recognised string encoding are supported, however most common
    -- implementations do support all possible combinations.
    --
    UnsuportedConversion EncodingName EncodingName

    -- | This covers two possible conversion errors:
    --
    -- * There is a byte sequence in the input that is not valid in the input
    -- encoding.
    --
    -- * There is a valid character in the input that has no corresponding
    -- character in the output encoding.
    --
    -- Unfortunately iconv does not let us distinguish these two cases. In
    -- either case, the Int parameter gives the byte offset in the input of
    -- the unrecognised bytes or unconvertable character.
    --
  | InvalidChar Int

    -- | This error covers the case where the end of the input has trailing
    -- bytes that are the initial bytes of a valid character in the input
    -- encoding. In other words, it looks like the input ended in the middle of
    -- a multi-byte character. This would often be an indication that the input
    -- was somehow truncated. Again, the Int parameter is the byte offset in
    -- the input where the incomplete character starts.
    --
  | IncompleteChar Int

    -- | An unexpected iconv error. The iconv spec lists a number of possible
    -- expected errors but does not guarantee that there might not be other
    -- errors.
    --
    -- This error can occur either immediately, which might indicate that the
    -- iconv installation is messed up somehow, or it could occur later which
    -- might indicate resource exhaustion or some other internal iconv error.
    --
    -- Use 'Foreign.C.Error.errnoToIOError' to get slightly more information
    -- on what the error could possibly be.
    --
  | UnexpectedError C.Error.Errno

reportConversionError :: ConversionError -> IOError
reportConversionError :: ConversionError -> IOError
reportConversionError conversionError :: ConversionError
conversionError = case ConversionError
conversionError of
  UnsuportedConversion fromEncoding :: EncodingName
fromEncoding toEncoding :: EncodingName
toEncoding
                          -> EncodingName -> IOError
err (EncodingName -> IOError) -> EncodingName -> IOError
forall a b. (a -> b) -> a -> b
$ "cannot convert from string encoding "
                             EncodingName -> EncodingName -> EncodingName
forall a. [a] -> [a] -> [a]
++ EncodingName -> EncodingName
forall a. Show a => a -> EncodingName
show EncodingName
fromEncoding EncodingName -> EncodingName -> EncodingName
forall a. [a] -> [a] -> [a]
++ " to string encoding "
                             EncodingName -> EncodingName -> EncodingName
forall a. [a] -> [a] -> [a]
++ EncodingName -> EncodingName
forall a. Show a => a -> EncodingName
show EncodingName
toEncoding
  InvalidChar    inputPos :: Int
inputPos -> EncodingName -> IOError
err (EncodingName -> IOError) -> EncodingName -> IOError
forall a b. (a -> b) -> a -> b
$ "invalid input sequence at byte offset "
                               EncodingName -> EncodingName -> EncodingName
forall a. [a] -> [a] -> [a]
++ Int -> EncodingName
forall a. Show a => a -> EncodingName
show Int
inputPos
  IncompleteChar inputPos :: Int
inputPos -> EncodingName -> IOError
err (EncodingName -> IOError) -> EncodingName -> IOError
forall a b. (a -> b) -> a -> b
$ "incomplete input sequence at byte offset "
                               EncodingName -> EncodingName -> EncodingName
forall a. [a] -> [a] -> [a]
++ Int -> EncodingName
forall a. Show a => a -> EncodingName
show Int
inputPos
  UnexpectedError errno :: Errno
errno   -> EncodingName
-> Errno -> Maybe Handle -> Maybe EncodingName -> IOError
C.Error.errnoToIOError
                               "Codec.Text.IConv: unexpected error" Errno
errno
                               Maybe Handle
forall a. Maybe a
Nothing Maybe EncodingName
forall a. Maybe a
Nothing
  where err :: EncodingName -> IOError
err msg :: EncodingName
msg = EncodingName -> IOError
userError (EncodingName -> IOError) -> EncodingName -> IOError
forall a b. (a -> b) -> a -> b
$ "Codec.Text.IConv: " EncodingName -> EncodingName -> EncodingName
forall a. [a] -> [a] -> [a]
++ EncodingName
msg

{-# NOINLINE convert #-}
-- | Convert text from one named string encoding to another.
--
-- * The conversion is done lazily.
--
-- * An exception is thrown if conversion between the two encodings is not
-- supported.
--
-- * An exception is thrown if there are any encoding conversion errors.
--
convert :: EncodingName      -- ^ Name of input string encoding
        -> EncodingName      -- ^ Name of output string encoding
        -> L.ByteString      -- ^ Input text
        -> L.ByteString      -- ^ Output text
convert :: EncodingName -> EncodingName -> ByteString -> ByteString
convert fromEncoding :: EncodingName
fromEncoding toEncoding :: EncodingName
toEncoding =

    -- lazily convert the list of spans into an ordinary lazy ByteString:
    [ByteString] -> ByteString
L.fromChunks
  ([ByteString] -> ByteString)
-> (ByteString -> [ByteString]) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Span -> [ByteString] -> [ByteString])
-> [ByteString] -> [Span] -> [ByteString]
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Span -> [ByteString] -> [ByteString]
span []
  ([Span] -> [ByteString])
-> (ByteString -> [Span]) -> ByteString -> [ByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. EncodingName -> EncodingName -> ByteString -> [Span]
convertLazily EncodingName
fromEncoding EncodingName
toEncoding

  where
    span :: Span -> [ByteString] -> [ByteString]
span (Span c :: ByteString
c)            cs :: [ByteString]
cs = ByteString
c ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
: [ByteString]
cs
    span (ConversionError e :: ConversionError
e) _  =
#if MIN_VERSION_base(4,0,0)
      IOError -> [ByteString]
forall a e. Exception e => e -> a
Exception.throw (ConversionError -> IOError
reportConversionError ConversionError
e)
#else
      Exception.throw (Exception.IOException (reportConversionError e))
#endif


data Fuzzy = Transliterate | Discard

-- | Convert text ignoring encoding conversion problems.
--
-- If invalid byte sequences are found in the input they are ignored and
-- conversion continues if possible. This is not always possible especially
-- with stateful encodings. No placeholder character is inserted into the
-- output so there will be no indication that invalid byte sequences were
-- encountered.
--
-- If there are characters in the input that have no direct corresponding
-- character in the output encoding then they are dealt in one of two ways,
-- depending on the 'Fuzzy' argument. We can try and 'Transliterate' them into
-- the nearest corresponding character(s) or use a replacement character
-- (typically @\'?\'@ or the Unicode replacement character). Alternatively they
-- can simply be 'Discard'ed.
--
-- In either case, no exceptions will occur. In the case of unrecoverable
-- errors, the output will simply be truncated. This includes the case of
-- unrecognised or unsupported encoding names; the output will be empty.
--
-- * This function only works with the GNU iconv implementation which provides
-- this feature beyond what is required by the iconv specification.
--
convertFuzzy :: Fuzzy           -- ^ Whether to try and transliterate or
                                -- discard characters with no direct conversion
             -> EncodingName    -- ^ Name of input string encoding
             -> EncodingName    -- ^ Name of output string encoding
             -> L.ByteString    -- ^ Input text
             -> L.ByteString    -- ^ Output text
convertFuzzy :: Fuzzy -> EncodingName -> EncodingName -> ByteString -> ByteString
convertFuzzy fuzzy :: Fuzzy
fuzzy fromEncoding :: EncodingName
fromEncoding toEncoding :: EncodingName
toEncoding =

    -- lazily convert the list of spans into an ordinary lazy ByteString:
    [ByteString] -> ByteString
L.fromChunks
  ([ByteString] -> ByteString)
-> (ByteString -> [ByteString]) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Span -> [ByteString] -> [ByteString])
-> [ByteString] -> [Span] -> [ByteString]
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Span -> [ByteString] -> [ByteString]
span []
  ([Span] -> [ByteString])
-> (ByteString -> [Span]) -> ByteString -> [ByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. InvalidCharBehaviour
-> EncodingName -> EncodingName -> ByteString -> [Span]
convertInternal InvalidCharBehaviour
IgnoreInvalidChar EncodingName
fromEncoding (EncodingName
toEncoding EncodingName -> EncodingName -> EncodingName
forall a. [a] -> [a] -> [a]
++ EncodingName
mode)
  where
    mode :: EncodingName
mode = case Fuzzy
fuzzy of
             Transliterate -> "//IGNORE,TRANSLIT"
             Discard       -> "//IGNORE"
    span :: Span -> [ByteString] -> [ByteString]
span (Span c :: ByteString
c)            cs :: [ByteString]
cs = ByteString
c ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
: [ByteString]
cs
    span (ConversionError _) cs :: [ByteString]
cs = [ByteString]
cs

{-# NOINLINE convertStrictly #-}
-- | This variant does the conversion all in one go, so it is able to report
-- any conversion errors up front. It exposes all the possible error conditions
-- and never throws exceptions
--
-- The disadvantage is that no output can be produced before the whole input
-- is consumed. This might be problematic for very large inputs.
--
convertStrictly :: EncodingName      -- ^ Name of input string encoding
                -> EncodingName      -- ^ Name of output string encoding
                -> L.ByteString      -- ^ Input text
                -> Either L.ByteString
                          ConversionError -- ^ Output text or conversion error
convertStrictly :: EncodingName
-> EncodingName -> ByteString -> Either ByteString ConversionError
convertStrictly fromEncoding :: EncodingName
fromEncoding toEncoding :: EncodingName
toEncoding =
    -- strictly convert the list of spans into an ordinary lazy ByteString
    -- or an error
    [ByteString] -> [Span] -> Either ByteString ConversionError
strictify []
  ([Span] -> Either ByteString ConversionError)
-> (ByteString -> [Span])
-> ByteString
-> Either ByteString ConversionError
forall b c a. (b -> c) -> (a -> b) -> a -> c
. EncodingName -> EncodingName -> ByteString -> [Span]
convertLazily EncodingName
fromEncoding EncodingName
toEncoding

  where
    strictify :: [S.ByteString] -> [Span] -> Either L.ByteString ConversionError
    strictify :: [ByteString] -> [Span] -> Either ByteString ConversionError
strictify cs :: [ByteString]
cs []                    = ByteString -> Either ByteString ConversionError
forall a b. a -> Either a b
Left ([ByteString] -> ByteString
L.fromChunks ([ByteString] -> [ByteString]
forall a. [a] -> [a]
reverse [ByteString]
cs))
    strictify cs :: [ByteString]
cs (Span c :: ByteString
c : ss :: [Span]
ss)         = [ByteString] -> [Span] -> Either ByteString ConversionError
strictify (ByteString
cByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
:[ByteString]
cs) [Span]
ss
    strictify _  (ConversionError e :: ConversionError
e:_) = ConversionError -> Either ByteString ConversionError
forall a b. b -> Either a b
Right ConversionError
e


{-# NOINLINE convertLazily #-}
-- | This version provides a more complete but less convenient conversion
-- interface. It exposes all the possible error conditions and never throws
-- exceptions.
--
-- The conversion is still lazy. It returns a list of spans, where a span may
-- be an ordinary span of output text or a conversion error. This somewhat
-- complex interface allows both for lazy conversion and for precise reporting
-- of conversion problems. The other functions 'convert' and 'convertStrictly'
-- are actually simple wrappers on this function.
--
convertLazily :: EncodingName  -- ^ Name of input string encoding
              -> EncodingName  -- ^ Name of output string encoding
              -> L.ByteString  -- ^ Input text
              -> [Span]        -- ^ Output text spans
convertLazily :: EncodingName -> EncodingName -> ByteString -> [Span]
convertLazily = InvalidCharBehaviour
-> EncodingName -> EncodingName -> ByteString -> [Span]
convertInternal InvalidCharBehaviour
StopOnInvalidChar


data InvalidCharBehaviour = StopOnInvalidChar | IgnoreInvalidChar

convertInternal :: InvalidCharBehaviour
                -> EncodingName -> EncodingName
                -> L.ByteString -> [Span]
convertInternal :: InvalidCharBehaviour
-> EncodingName -> EncodingName -> ByteString -> [Span]
convertInternal ignore :: InvalidCharBehaviour
ignore fromEncoding :: EncodingName
fromEncoding toEncoding :: EncodingName
toEncoding input :: ByteString
input =
  EncodingName
-> EncodingName -> (InitStatus -> IConv [Span]) -> [Span]
forall a.
EncodingName -> EncodingName -> (InitStatus -> IConv a) -> a
IConv.run EncodingName
fromEncoding EncodingName
toEncoding ((InitStatus -> IConv [Span]) -> [Span])
-> (InitStatus -> IConv [Span]) -> [Span]
forall a b. (a -> b) -> a -> b
$ \status :: InitStatus
status -> case InitStatus
status of
    IConv.InitOk -> do Int -> IConv ()
IConv.newOutputBuffer Int
outChunkSize
                       InvalidCharBehaviour -> [ByteString] -> IConv [Span]
fillInputBuffer InvalidCharBehaviour
ignore (ByteString -> [ByteString]
L.toChunks ByteString
input)

    IConv.UnsupportedConversion     -> ConversionError -> IConv [Span]
failConversion (EncodingName -> EncodingName -> ConversionError
UnsuportedConversion
                                                         EncodingName
fromEncoding
                                                         EncodingName
toEncoding)
    IConv.UnexpectedInitError errno :: Errno
errno -> ConversionError -> IConv [Span]
failConversion (Errno -> ConversionError
UnexpectedError Errno
errno)


fillInputBuffer :: InvalidCharBehaviour -> [S.ByteString] -> IConv [Span]
fillInputBuffer :: InvalidCharBehaviour -> [ByteString] -> IConv [Span]
fillInputBuffer ignore :: InvalidCharBehaviour
ignore (inChunk :: ByteString
inChunk : inChunks :: [ByteString]
inChunks) = do
  ByteString -> IConv ()
IConv.pushInputBuffer ByteString
inChunk
  InvalidCharBehaviour -> [ByteString] -> IConv [Span]
drainBuffers InvalidCharBehaviour
ignore [ByteString]
inChunks

fillInputBuffer _ignore :: InvalidCharBehaviour
_ignore [] = do
  Int
outputBufferBytesAvailable <- IConv Int
IConv.outputBufferBytesAvailable
  IConv ()
IConv.finalise
  if Int
outputBufferBytesAvailable Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> 0
    then do ByteString
outChunk <- IConv ByteString
IConv.popOutputBuffer
            [Span] -> IConv [Span]
forall (m :: * -> *) a. Monad m => a -> m a
return [ByteString -> Span
Span ByteString
outChunk]
    else [Span] -> IConv [Span]
forall (m :: * -> *) a. Monad m => a -> m a
return []


drainBuffers :: InvalidCharBehaviour -> [S.ByteString] -> IConv [Span]
drainBuffers :: InvalidCharBehaviour -> [ByteString] -> IConv [Span]
drainBuffers ignore :: InvalidCharBehaviour
ignore inChunks :: [ByteString]
inChunks = do
  Bool
inputBufferEmpty_ <- IConv Bool
IConv.inputBufferEmpty
  Bool
outputBufferFull <- IConv Bool
IConv.outputBufferFull
  Bool -> IConv () -> IConv ()
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Bool -> Bool
not Bool
outputBufferFull Bool -> Bool -> Bool
&& Bool -> Bool
not Bool
inputBufferEmpty_) (IConv () -> IConv ()) -> IConv () -> IConv ()
forall a b. (a -> b) -> a -> b
$ () -> IConv ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()
  -- this invariant guarantees we can always make forward progress

  Status
status <- IConv Status
IConv.iconv

  case Status
status of
    IConv.InputEmpty -> do
      Bool
inputBufferEmpty <- IConv Bool
IConv.inputBufferEmpty
      Bool -> IConv [Span] -> IConv [Span]
forall a. (?callStack::CallStack) => Bool -> a -> a
assert Bool
inputBufferEmpty (IConv [Span] -> IConv [Span]) -> IConv [Span] -> IConv [Span]
forall a b. (a -> b) -> a -> b
$ InvalidCharBehaviour -> [ByteString] -> IConv [Span]
fillInputBuffer InvalidCharBehaviour
ignore [ByteString]
inChunks

    IConv.OutputFull -> do
      ByteString
outChunk <- IConv ByteString
IConv.popOutputBuffer
      [Span]
outChunks <- IConv [Span] -> IConv [Span]
forall a. IConv a -> IConv a
IConv.unsafeInterleave (IConv [Span] -> IConv [Span]) -> IConv [Span] -> IConv [Span]
forall a b. (a -> b) -> a -> b
$ do
        Int -> IConv ()
IConv.newOutputBuffer Int
outChunkSize
        InvalidCharBehaviour -> [ByteString] -> IConv [Span]
drainBuffers InvalidCharBehaviour
ignore [ByteString]
inChunks
      [Span] -> IConv [Span]
forall (m :: * -> *) a. Monad m => a -> m a
return (ByteString -> Span
Span ByteString
outChunk Span -> [Span] -> [Span]
forall a. a -> [a] -> [a]
: [Span]
outChunks)

    IConv.InvalidChar -> InvalidCharBehaviour -> [ByteString] -> IConv [Span]
invalidChar InvalidCharBehaviour
ignore [ByteString]
inChunks

    IConv.IncompleteChar -> InvalidCharBehaviour -> [ByteString] -> IConv [Span]
fixupBoundary InvalidCharBehaviour
ignore [ByteString]
inChunks

    IConv.UnexpectedError errno :: Errno
errno -> ConversionError -> IConv [Span]
failConversion (Errno -> ConversionError
UnexpectedError Errno
errno)

-- | The posix iconv api looks like it's designed specifically for streaming
-- and it is, except for one really really annoying corner case...
--
-- Suppose you're converting a stream, say by reading a file in 4k chunks. This
-- would seem to be the canonical use case for iconv, reading and converting an
-- input file. However suppose the 4k read chunk happens to split a multi-byte
-- character. Then iconv will stop just before that char and tell us that its
-- an incomplete char. So far so good. Now what we'd like to do is have iconv
-- remember those last few bytes in its conversion state so we can carry on
-- with the next 4k block. Sadly it does not. It requires us to fix things up
-- so that it can carry on with the next block starting with a complete multi-
-- byte character. Do do that we have to somehow copy those few trailing bytes
-- to the beginning of the next block. That's perhaps not too bad in an
-- imperitive context using a mutable input buffer - we'd just copy the few
-- trailing bytes to the beginning of the buffer and do a short read (ie 4k-n
-- the number of trailing bytes). That's not terribly nice since it means the
-- OS has to do IO on non-page aligned buffers which tends to be slower. It's
-- worse for us though since we're not using a mutable input buffer, we're
-- using a lazy bytestring which is a sequence of immutable buffers.
--
-- So we have to do more cunning things. We could just prepend the trailing
-- bytes to the next block, but that would mean alocating and copying the whole
-- next block just to prepend a couple bytes. This probably happens quite
-- frequently so would be pretty slow. So we have to be even more cunning.
--
-- The solution is to create a very small buffer to cover the few bytes making
-- up the character spanning the block boundary. So we copy the trailing bytes
-- plus a few from the beginning of the next block. Then we run iconv again on
-- that small buffer. How many bytes from the next block to copy is a slightly
-- tricky issue. If we copy too few there's no guarantee that we have enough to
-- give a complete character. We opt for a maximum size of 16, 'tmpChunkSize'
-- on the theory that no encoding in existance uses that many bytes to encode a
-- single character, so it ought to be enough. Yeah, it's a tad dodgey.
--
-- Having papered over the block boundary, we still have to cross the boundary
-- of this small buffer. It looks like we've still got the same problem,
-- however this time we should have crossed over into bytes that are wholly
-- part of the large following block so we can abandon our small temp buffer
-- an continue with the following block, with a slight offset for the few bytes
-- taken up by the chars that fit into the small buffer.
--
-- So yeah, pretty complex. Check out the proof below of the tricky case.
--
fixupBoundary :: InvalidCharBehaviour -> [S.ByteString] -> IConv [Span]
fixupBoundary :: InvalidCharBehaviour -> [ByteString] -> IConv [Span]
fixupBoundary _ignore :: InvalidCharBehaviour
_ignore [] = do
  Int
inputPos <- IConv Int
IConv.inputPosition
  ConversionError -> IConv [Span]
failConversion (Int -> ConversionError
IncompleteChar Int
inputPos)
fixupBoundary ignore :: InvalidCharBehaviour
ignore inChunks :: [ByteString]
inChunks@(inChunk :: ByteString
inChunk : inChunks' :: [ByteString]
inChunks') = do
  Int
inSize <- IConv Int
IConv.inputBufferSize
  Bool -> IConv () -> IConv ()
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Int
inSize Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
tmpChunkSize) (IConv () -> IConv ()) -> IConv () -> IConv ()
forall a b. (a -> b) -> a -> b
$ () -> IConv ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()
  let extraBytes :: Int
extraBytes = Int
tmpChunkSize Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
inSize

  if ByteString -> Int
S.length ByteString
inChunk Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
extraBytes
    then do
      (ByteString -> ByteString) -> IConv ()
IConv.replaceInputBuffer (ByteString -> ByteString -> ByteString
`S.append` ByteString
inChunk)
      InvalidCharBehaviour -> [ByteString] -> IConv [Span]
drainBuffers InvalidCharBehaviour
ignore [ByteString]
inChunks'
    else do
      (ByteString -> ByteString) -> IConv ()
IConv.replaceInputBuffer (ByteString -> ByteString -> ByteString
`S.append` Int -> ByteString -> ByteString
S.take Int
extraBytes ByteString
inChunk)

      Int
before <- IConv Int
IConv.inputBufferSize
      Bool -> IConv () -> IConv ()
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Int
before Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
tmpChunkSize) (IConv () -> IConv ()) -> IConv () -> IConv ()
forall a b. (a -> b) -> a -> b
$ () -> IConv ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()

      Status
status <- IConv Status
IConv.iconv
      Int
after <- IConv Int
IConv.inputBufferSize
      let consumed :: Int
consumed = Int
before Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
after

      case Status
status of
        IConv.InputEmpty ->
          Bool -> IConv [Span] -> IConv [Span]
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Int
consumed Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
tmpChunkSize) (IConv [Span] -> IConv [Span]) -> IConv [Span] -> IConv [Span]
forall a b. (a -> b) -> a -> b
$
          InvalidCharBehaviour -> [ByteString] -> IConv [Span]
fillInputBuffer InvalidCharBehaviour
ignore (Int -> ByteString -> ByteString
S.drop Int
extraBytes ByteString
inChunk ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
: [ByteString]
inChunks')

        IConv.OutputFull -> do
          ByteString
outChunk <- IConv ByteString
IConv.popOutputBuffer
          [Span]
outChunks <- IConv [Span] -> IConv [Span]
forall a. IConv a -> IConv a
IConv.unsafeInterleave (IConv [Span] -> IConv [Span]) -> IConv [Span] -> IConv [Span]
forall a b. (a -> b) -> a -> b
$ do
            Int -> IConv ()
IConv.newOutputBuffer Int
outChunkSize
            InvalidCharBehaviour -> [ByteString] -> IConv [Span]
drainBuffers InvalidCharBehaviour
ignore [ByteString]
inChunks
          [Span] -> IConv [Span]
forall (m :: * -> *) a. Monad m => a -> m a
return (ByteString -> Span
Span ByteString
outChunk Span -> [Span] -> [Span]
forall a. a -> [a] -> [a]
: [Span]
outChunks)

        IConv.InvalidChar -> InvalidCharBehaviour -> [ByteString] -> IConv [Span]
invalidChar InvalidCharBehaviour
ignore [ByteString]
inChunks

        IConv.IncompleteChar ->
          Bool -> IConv [Span] -> IConv [Span]
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Int
inSize Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
consumed Bool -> Bool -> Bool
&& Int
consumed Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
tmpChunkSize) (IConv [Span] -> IConv [Span]) -> IConv [Span] -> IConv [Span]
forall a b. (a -> b) -> a -> b
$
          --    inSize < consumed < tmpChunkSize
          -- => { subtract inSize from each side }
          --    0 < consumed - inSize < tmpChunkSize - inSize
          -- => { by definition that extraBytes = tmpChunkSize - inSize }
          --    0 < consumed - inSize < extraBytes
          -- => { since we're in the False case of the if, we know:
          --        not (S.length inChunk <= extraBytes)
          --      =      S.length inChunk > extraBytes
          --      =      extraBytes < S.length inChunk }
          --    0 < consumed - inSize < extraBytes < S.length inChunk
          --
          -- And we're done! We know it's safe to drop (consumed - inSize) from
          -- inChunk since it's more than 0 and less than the inChunk size, so
          -- we're not being left with an empty chunk (which is not allowed).

          InvalidCharBehaviour -> [ByteString] -> IConv [Span]
drainBuffers InvalidCharBehaviour
ignore (Int -> ByteString -> ByteString
S.drop (Int
consumed Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
inSize) ByteString
inChunk ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
: [ByteString]
inChunks')

        IConv.UnexpectedError errno :: Errno
errno -> ConversionError -> IConv [Span]
failConversion (Errno -> ConversionError
UnexpectedError Errno
errno)


invalidChar :: InvalidCharBehaviour -> [S.ByteString] -> IConv [Span]
invalidChar :: InvalidCharBehaviour -> [ByteString] -> IConv [Span]
invalidChar StopOnInvalidChar _ = do
  Int
inputPos <- IConv Int
IConv.inputPosition
  ConversionError -> IConv [Span]
failConversion (Int -> ConversionError
InvalidChar Int
inputPos)

invalidChar IgnoreInvalidChar inChunks :: [ByteString]
inChunks = do
  Int
inputPos  <- IConv Int
IConv.inputPosition
  let invalidCharError :: Span
invalidCharError = ConversionError -> Span
ConversionError (Int -> ConversionError
InvalidChar Int
inputPos)
  Int
outputBufferBytesAvailable <- IConv Int
IConv.outputBufferBytesAvailable
  if Int
outputBufferBytesAvailable Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> 0
    then do ByteString
outChunk  <- IConv ByteString
IConv.popOutputBuffer
            [Span]
outChunks <- IConv [Span] -> IConv [Span]
forall a. IConv a -> IConv a
IConv.unsafeInterleave (IConv [Span] -> IConv [Span]) -> IConv [Span] -> IConv [Span]
forall a b. (a -> b) -> a -> b
$ do
              Int -> IConv ()
IConv.newOutputBuffer Int
outChunkSize
              Bool
inputBufferEmpty <- IConv Bool
IConv.inputBufferEmpty
              if Bool
inputBufferEmpty
                then InvalidCharBehaviour -> [ByteString] -> IConv [Span]
fillInputBuffer InvalidCharBehaviour
IgnoreInvalidChar [ByteString]
inChunks
                else InvalidCharBehaviour -> [ByteString] -> IConv [Span]
drainBuffers    InvalidCharBehaviour
IgnoreInvalidChar [ByteString]
inChunks
            [Span] -> IConv [Span]
forall (m :: * -> *) a. Monad m => a -> m a
return (ByteString -> Span
Span ByteString
outChunk Span -> [Span] -> [Span]
forall a. a -> [a] -> [a]
: Span
invalidCharError Span -> [Span] -> [Span]
forall a. a -> [a] -> [a]
: [Span]
outChunks)
    else do [Span]
outChunks <- IConv [Span] -> IConv [Span]
forall a. IConv a -> IConv a
IConv.unsafeInterleave (IConv [Span] -> IConv [Span]) -> IConv [Span] -> IConv [Span]
forall a b. (a -> b) -> a -> b
$ do
              Int -> IConv ()
IConv.newOutputBuffer Int
outChunkSize
              Bool
inputBufferEmpty <- IConv Bool
IConv.inputBufferEmpty
              if Bool
inputBufferEmpty
                then InvalidCharBehaviour -> [ByteString] -> IConv [Span]
fillInputBuffer InvalidCharBehaviour
IgnoreInvalidChar [ByteString]
inChunks
                else InvalidCharBehaviour -> [ByteString] -> IConv [Span]
drainBuffers    InvalidCharBehaviour
IgnoreInvalidChar [ByteString]
inChunks
            [Span] -> IConv [Span]
forall (m :: * -> *) a. Monad m => a -> m a
return (Span
invalidCharError Span -> [Span] -> [Span]
forall a. a -> [a] -> [a]
: [Span]
outChunks)

failConversion :: ConversionError -> IConv [Span]
failConversion :: ConversionError -> IConv [Span]
failConversion err :: ConversionError
err = do
  Int
outputBufferBytesAvailable <- IConv Int
IConv.outputBufferBytesAvailable
  IConv ()
IConv.finalise
  if Int
outputBufferBytesAvailable Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> 0
    then do ByteString
outChunk <- IConv ByteString
IConv.popOutputBuffer
            [Span] -> IConv [Span]
forall (m :: * -> *) a. Monad m => a -> m a
return [ByteString -> Span
Span ByteString
outChunk, ConversionError -> Span
ConversionError ConversionError
err]
    else    [Span] -> IConv [Span]
forall (m :: * -> *) a. Monad m => a -> m a
return [               ConversionError -> Span
ConversionError ConversionError
err]

outChunkSize :: Int
outChunkSize :: Int
outChunkSize = Int
L.defaultChunkSize

tmpChunkSize :: Int
tmpChunkSize :: Int
tmpChunkSize = 16