module Language.Haskell.HsColour.Classify
  ( TokenType(..)
  , tokenise
  ) where

import Data.Char (isSpace, isUpper, isLower, isDigit)
import Data.List

-- | Lex Haskell source code into an annotated token stream, without
--   discarding any characters or layout.
tokenise :: String -> [(TokenType,String)]
tokenise :: String -> [(TokenType, String)]
tokenise str :: String
str = 
    let chunks :: [String]
chunks = [String] -> [String]
glue ([String] -> [String])
-> (String -> [String]) -> String -> [String]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> [String]
chunk (String -> [String]) -> String -> [String]
forall a b. (a -> b) -> a -> b
$ String
str 
    in [(TokenType, String)] -> [(TokenType, String)]
markDefs ([(TokenType, String)] -> [(TokenType, String)])
-> [(TokenType, String)] -> [(TokenType, String)]
forall a b. (a -> b) -> a -> b
$ (String -> (TokenType, String))
-> [String] -> [(TokenType, String)]
forall a b. (a -> b) -> [a] -> [b]
map (\s :: String
s-> (String -> TokenType
classify String
s,String
s)) [String]
chunks

markDefs :: [(TokenType, String)] -> [(TokenType, String)]
markDefs :: [(TokenType, String)] -> [(TokenType, String)]
markDefs [] = []
markDefs ((Varid, s :: String
s) : rest :: [(TokenType, String)]
rest) = (TokenType
Definition, String
s) (TokenType, String)
-> [(TokenType, String)] -> [(TokenType, String)]
forall a. a -> [a] -> [a]
: [(TokenType, String)] -> [(TokenType, String)]
continue [(TokenType, String)]
rest
markDefs ((Varop, ">") : (Space, " ") : (Varid, d :: String
d) : rest :: [(TokenType, String)]
rest) =
    (TokenType
Varop, ">") (TokenType, String)
-> [(TokenType, String)] -> [(TokenType, String)]
forall a. a -> [a] -> [a]
: (TokenType
Space, " ") (TokenType, String)
-> [(TokenType, String)] -> [(TokenType, String)]
forall a. a -> [a] -> [a]
: (TokenType
Definition, String
d) (TokenType, String)
-> [(TokenType, String)] -> [(TokenType, String)]
forall a. a -> [a] -> [a]
: [(TokenType, String)] -> [(TokenType, String)]
continue [(TokenType, String)]
rest
markDefs rest :: [(TokenType, String)]
rest = [(TokenType, String)] -> [(TokenType, String)]
continue [(TokenType, String)]
rest

continue :: [(TokenType, String)] -> [(TokenType, String)]
continue rest :: [(TokenType, String)]
rest 
    = let (thisLine :: [(TokenType, String)]
thisLine, nextLine :: [(TokenType, String)]
nextLine) = ((TokenType, String) -> Bool)
-> [(TokenType, String)]
-> ([(TokenType, String)], [(TokenType, String)])
forall a. (a -> Bool) -> [a] -> ([a], [a])
span ((TokenType, String) -> (TokenType, String) -> Bool
forall a. Eq a => a -> a -> Bool
/= (TokenType
Space, "\n")) [(TokenType, String)]
rest
      in
        case [(TokenType, String)]
nextLine of
          [] -> [(TokenType, String)]
thisLine
          ((Space, "\n"):nextLine' :: [(TokenType, String)]
nextLine') -> ([(TokenType, String)]
thisLine [(TokenType, String)]
-> [(TokenType, String)] -> [(TokenType, String)]
forall a. [a] -> [a] -> [a]
++ ((TokenType
Space, "\n") (TokenType, String)
-> [(TokenType, String)] -> [(TokenType, String)]
forall a. a -> [a] -> [a]
: ([(TokenType, String)] -> [(TokenType, String)]
markDefs [(TokenType, String)]
nextLine')))


-- Basic Haskell lexing, except we keep whitespace.
chunk :: String -> [String]
chunk :: String -> [String]
chunk []    = []
chunk ('\r':s :: String
s) = String -> [String]
chunk String
s -- get rid of DOS newline stuff
chunk ('\n':s :: String
s) = "\n"String -> [String] -> [String]
forall a. a -> [a] -> [a]
: String -> [String]
chunk String
s
chunk (c :: Char
c:s :: String
s) | Char -> Bool
isLinearSpace Char
c
            = (Char
cChar -> String -> String
forall a. a -> [a] -> [a]
:String
ss)String -> [String] -> [String]
forall a. a -> [a] -> [a]
: String -> [String]
chunk String
rest where (ss :: String
ss,rest :: String
rest) = (Char -> Bool) -> String -> (String, String)
forall a. (a -> Bool) -> [a] -> ([a], [a])
span Char -> Bool
isLinearSpace String
s
chunk ('{':'-':s :: String
s) = let (com :: String
com,s' :: String
s') = Int -> String -> (String, String)
nestcomment 0 String
s
                    in ('{'Char -> String -> String
forall a. a -> [a] -> [a]
:'-'Char -> String -> String
forall a. a -> [a] -> [a]
:String
com) String -> [String] -> [String]
forall a. a -> [a] -> [a]
: String -> [String]
chunk String
s'
chunk s :: String
s = case ReadS String
Prelude.lex String
s of
              []             -> [String -> Char
forall a. [a] -> a
head String
s]String -> [String] -> [String]
forall a. a -> [a] -> [a]
: String -> [String]
chunk (String -> String
forall a. [a] -> [a]
tail String
s) -- e.g. inside comment
              ((tok :: String
tok@('-':'-':_),rest :: String
rest):_)
                  | (Char -> Bool) -> String -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
=='-') String
tok -> (String
tokString -> String -> String
forall a. [a] -> [a] -> [a]
++String
com)String -> [String] -> [String]
forall a. a -> [a] -> [a]
: String -> [String]
chunk String
s'
                                       where (com :: String
com,s' :: String
s') = String -> (String, String)
eolcomment String
rest
              ((tok :: String
tok,rest :: String
rest):_) -> String
tokString -> [String] -> [String]
forall a. a -> [a] -> [a]
: String -> [String]
chunk String
rest

isLinearSpace :: Char -> Bool
isLinearSpace c :: Char
c = Char
c Char -> String -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` " \t\f" -- " \t\xa0"

-- Glue sequences of tokens into more useful blobs
glue :: [String] -> [String]
glue (q :: String
q:".":n :: String
n:rest :: [String]
rest) | Char -> Bool
isUpper (String -> Char
forall a. [a] -> a
head String
q)	-- qualified names
                    = [String] -> [String]
glue ((String
qString -> String -> String
forall a. [a] -> [a] -> [a]
++"."String -> String -> String
forall a. [a] -> [a] -> [a]
++String
n)String -> [String] -> [String]
forall a. a -> [a] -> [a]
: [String]
rest)
glue ("`":rest :: [String]
rest) =				-- `varid` -> varop
  case [String] -> [String]
glue [String]
rest of
    (qn :: String
qn:"`":rest :: [String]
rest) -> ("`"String -> String -> String
forall a. [a] -> [a] -> [a]
++String
qnString -> String -> String
forall a. [a] -> [a] -> [a]
++"`")String -> [String] -> [String]
forall a. a -> [a] -> [a]
: [String] -> [String]
glue [String]
rest
    _             -> "`"String -> [String] -> [String]
forall a. a -> [a] -> [a]
: [String] -> [String]
glue [String]
rest
glue (s :: String
s:ss :: [String]
ss)       | (Char -> Bool) -> String -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
=='-') String
s Bool -> Bool -> Bool
&& String -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length String
s Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>=2	-- eol comment
                  = (String
sString -> String -> String
forall a. [a] -> [a] -> [a]
++[String] -> String
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [String]
c)String -> [String] -> [String]
forall a. a -> [a] -> [a]
: [String] -> [String]
glue [String]
rest
                  where (c :: [String]
c,rest :: [String]
rest) = (String -> Bool) -> [String] -> ([String], [String])
forall a. (a -> Bool) -> [a] -> ([a], [a])
break ('\n'Char -> String -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem`) [String]
ss
--glue ("{":"-":ss)  = ("{-"++c): glue rest	-- nested comment
--                  where (c,rest) = nestcomment 0 ss
glue ("(":ss :: [String]
ss) = case [String]
rest of
                ")":rest :: [String]
rest -> ("(" String -> String -> String
forall a. [a] -> [a] -> [a]
++ [String] -> String
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [String]
tuple String -> String -> String
forall a. [a] -> [a] -> [a]
++ ")") String -> [String] -> [String]
forall a. a -> [a] -> [a]
: [String] -> [String]
glue [String]
rest
                _         -> "(" String -> [String] -> [String]
forall a. a -> [a] -> [a]
: [String] -> [String]
glue [String]
ss
              where (tuple :: [String]
tuple,rest :: [String]
rest) = (String -> Bool) -> [String] -> ([String], [String])
forall a. (a -> Bool) -> [a] -> ([a], [a])
span (String -> String -> Bool
forall a. Eq a => a -> a -> Bool
==",") [String]
ss
glue ("[":"]":ss :: [String]
ss) = "[]" String -> [String] -> [String]
forall a. a -> [a] -> [a]
: [String] -> [String]
glue [String]
ss
glue ("\n":"#":ss :: [String]
ss)= "\n" String -> [String] -> [String]
forall a. a -> [a] -> [a]
: ('#'Char -> String -> String
forall a. a -> [a] -> [a]
:[String] -> String
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [String]
line) String -> [String] -> [String]
forall a. a -> [a] -> [a]
: [String] -> [String]
glue [String]
rest
                  where (line :: [String]
line,rest :: [String]
rest) = (String -> Bool) -> [String] -> ([String], [String])
forall a. (a -> Bool) -> [a] -> ([a], [a])
break ('\n'Char -> String -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem`) [String]
ss
glue (s :: String
s:ss :: [String]
ss)       = String
sString -> [String] -> [String]
forall a. a -> [a] -> [a]
: [String] -> [String]
glue [String]
ss
glue []           = []

-- Deal with comments.
nestcomment :: Int -> String -> (String,String)
nestcomment :: Int -> String -> (String, String)
nestcomment n :: Int
n ('{':'-':ss :: String
ss) | Int
nInt -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>=0 = (("{-"String -> String -> String
forall a. [a] -> [a] -> [a]
++String
cs),String
rm)
                                  where (cs :: String
cs,rm :: String
rm) = Int -> String -> (String, String)
nestcomment (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) String
ss
nestcomment n :: Int
n ('-':'}':ss :: String
ss) | Int
nInt -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>0  = (("-}"String -> String -> String
forall a. [a] -> [a] -> [a]
++String
cs),String
rm)
                                  where (cs :: String
cs,rm :: String
rm) = Int -> String -> (String, String)
nestcomment (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) String
ss
nestcomment n :: Int
n ('-':'}':ss :: String
ss) | Int
nInt -> Int -> Bool
forall a. Eq a => a -> a -> Bool
==0 = ("-}",String
ss)
nestcomment n :: Int
n (s :: Char
s:ss :: String
ss)       | Int
nInt -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>=0 = ((Char
sChar -> String -> String
forall a. a -> [a] -> [a]
:String
cs),String
rm)
                                  where (cs :: String
cs,rm :: String
rm) = Int -> String -> (String, String)
nestcomment Int
n String
ss
nestcomment n :: Int
n [] = ([],[])

eolcomment :: String -> (String,String)
eolcomment :: String -> (String, String)
eolcomment s :: String
s@('\n':_) = ([], String
s)
eolcomment ('\r':s :: String
s)   = String -> (String, String)
eolcomment String
s
eolcomment (c :: Char
c:s :: String
s)      = (Char
cChar -> String -> String
forall a. a -> [a] -> [a]
:String
cs, String
s') where (cs :: String
cs,s' :: String
s') = String -> (String, String)
eolcomment String
s
eolcomment []         = ([],[])

-- | Classification of tokens as lexical entities
data TokenType =
  Space | Keyword | Keyglyph | Layout | Comment | Conid | Varid |
  Conop | Varop   | String   | Char   | Number  | Cpp   | Error |
  Definition
  deriving (TokenType -> TokenType -> Bool
(TokenType -> TokenType -> Bool)
-> (TokenType -> TokenType -> Bool) -> Eq TokenType
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: TokenType -> TokenType -> Bool
$c/= :: TokenType -> TokenType -> Bool
== :: TokenType -> TokenType -> Bool
$c== :: TokenType -> TokenType -> Bool
Eq,Int -> TokenType -> String -> String
[TokenType] -> String -> String
TokenType -> String
(Int -> TokenType -> String -> String)
-> (TokenType -> String)
-> ([TokenType] -> String -> String)
-> Show TokenType
forall a.
(Int -> a -> String -> String)
-> (a -> String) -> ([a] -> String -> String) -> Show a
showList :: [TokenType] -> String -> String
$cshowList :: [TokenType] -> String -> String
show :: TokenType -> String
$cshow :: TokenType -> String
showsPrec :: Int -> TokenType -> String -> String
$cshowsPrec :: Int -> TokenType -> String -> String
Show)

classify :: String -> TokenType
classify :: String -> TokenType
classify s :: String
s@(h :: Char
h:t :: String
t)
    | Char -> Bool
isSpace Char
h              = TokenType
Space
    | (Char -> Bool) -> String -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
=='-') String
s          = TokenType
Comment
    | "--" String -> String -> Bool
forall a. Eq a => [a] -> [a] -> Bool
`isPrefixOf` String
s
      Bool -> Bool -> Bool
&& (Char -> Bool) -> String -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any Char -> Bool
isSpace String
s       = TokenType
Comment		-- not fully correct
    | "{-" String -> String -> Bool
forall a. Eq a => [a] -> [a] -> Bool
`isPrefixOf` String
s    = TokenType
Comment
    | String
s String -> [String] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [String]
keywords      = TokenType
Keyword
    | String
s String -> [String] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [String]
keyglyphs     = TokenType
Keyglyph
    | String
s String -> [String] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [String]
layoutchars   = TokenType
Layout
    | Char -> Bool
isUpper Char
h              = TokenType
Conid
    | String
s String -> String -> Bool
forall a. Eq a => a -> a -> Bool
== "[]"              = TokenType
Conid
    | Char
h Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '(' Bool -> Bool -> Bool
&& String -> Bool
isTupleTail String
t = TokenType
Conid
    | Char
h Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '#'               = TokenType
Cpp
    | Char -> Bool
isLower Char
h              = TokenType
Varid
    | Char
h Char -> String -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` String
symbols       = TokenType
Varop
    | Char
hChar -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==':'                 = TokenType
Conop
    | Char
hChar -> Char -> Bool
forall a. Eq a => a -> a -> Bool
=='`'                 = TokenType
Varop
    | Char
hChar -> Char -> Bool
forall a. Eq a => a -> a -> Bool
=='"'                 = TokenType
String
    | Char
hChar -> Char -> Bool
forall a. Eq a => a -> a -> Bool
=='\''                = TokenType
Char
    | Char -> Bool
isDigit Char
h              = TokenType
Number
    | Bool
otherwise              = TokenType
Error
classify _ = TokenType
Space

isTupleTail :: String -> Bool
isTupleTail [')'] = Bool
True
isTupleTail (',':xs :: String
xs) = String -> Bool
isTupleTail String
xs
isTupleTail _ = Bool
False


-- Haskell keywords
keywords :: [String]
keywords =
  ["case","class","data","default","deriving","do","else","forall"
  ,"if","import","in","infix","infixl","infixr","instance","let","module"
  ,"newtype","of","qualified","then","type","where","_"
  ,"foreign","ccall","as","safe","unsafe","family"]
keyglyphs :: [String]
keyglyphs =
  ["..","::","=","\\","|","<-","->","@","~","=>","[","]"]
layoutchars :: [String]
layoutchars =
  (Char -> String) -> String -> [String]
forall a b. (a -> b) -> [a] -> [b]
map (Char -> String -> String
forall a. a -> [a] -> [a]
:[]) ";{}(),"
symbols :: String
symbols =
  "!#$%&*+./<=>?@\\^|-~"