{-# LANGUAGE DeriveDataTypeable #-}
module Data.CharSet.Unicode.Category
(
Category(..)
, categories
, lookupCategory
, lookupCategoryCharSet
, modifierLetter, otherLetter, letter
, lowercaseLetter, uppercaseLetter, titlecaseLetter, letterAnd
, nonSpacingMark, spacingCombiningMark, enclosingMark, mark
, space, lineSeparator, paragraphSeparator, separator
, mathSymbol, currencySymbol, modifierSymbol, otherSymbol, symbol
, decimalNumber, letterNumber, otherNumber, number
, dashPunctuation, openPunctuation, closePunctuation, initialQuote
, finalQuote, connectorPunctuation, otherPunctuation, punctuation
, control, format, privateUse, surrogate, notAssigned, other
) where
import Data.Char
import Data.CharSet
import Data.Data
import Data.HashMap.Lazy (HashMap)
import qualified Data.HashMap.Lazy as HashMap
-- | A Unicode general category (or a grouping of several general
-- categories) together with the names it can be looked up by.
data Category = Category
    { categoryName :: String
      -- ^ Long name of the category, e.g. @\"Lowercase_Letter\"@.
    , categoryAbbreviation :: String
      -- ^ Short one- or two-character abbreviation, e.g. @\"Ll\"@.
    , categoryCharSet :: CharSet
      -- ^ The set of characters associated with this category.
    , categoryDescription :: String
      -- ^ Human-readable description of the category.
    } deriving (Show, Data, Typeable)
-- | Every category known to this module: each major class (Letter, Mark,
-- Separator, Symbol, Number, Punctuation, Other) followed by its
-- subcategories.
--
-- Each entry pairs a long name and an abbreviation with the corresponding
-- 'CharSet' and a short human-readable description.
categories :: [Category]
categories =
    [ Category "Letter" "L" letter "any kind of letter from any language."
    , Category "Lowercase_Letter" "Ll" lowercaseLetter "a lowercase letter that has an uppercase variant"
    , Category "Uppercase_Letter" "Lu" uppercaseLetter "an uppercase letter that has a lowercase variant"
    , Category "Titlecase_Letter" "Lt" titlecaseLetter "a letter that appears at the start of a word when only the first letter of the word is capitalized"
    , Category "Letter&" "L&" letterAnd "a letter that exists in lowercase and uppercase variants (combination of Ll, Lu and Lt)"
    , Category "Modifier_Letter" "Lm" modifierLetter "a special character that is used like a letter"
    , Category "Other_Letter" "Lo" otherLetter "a letter or ideograph that does not have lowercase and uppercase variants"
    , Category "Mark" "M" mark "a character intended to be combined with another character (e.g. accents, umlauts, enclosing boxes, etc.)"
    , Category "Non_Spacing_Mark" "Mn" nonSpacingMark "a character intended to be combined with another character without taking up extra space (e.g. accents, umlauts, etc.)"
    , Category "Spacing_Combining_Mark" "Mc" spacingCombiningMark "a character intended to be combined with another character that takes up extra space (vowel signs in many Eastern languages)"
      -- Description typo fixed: the original read "is is combined with".
    , Category "Enclosing_Mark" "Me" enclosingMark "a character that encloses the character it is combined with (circle, square, keycap, etc.)"
    , Category "Separator" "Z" separator "any kind of whitespace or invisible separator"
    , Category "Space_Separator" "Zs" space "a whitespace character that is invisible, but does take up space"
    , Category "Line_Separator" "Zl" lineSeparator "line separator character U+2028"
    , Category "Paragraph_Separator" "Zp" paragraphSeparator "paragraph separator character U+2029"
    , Category "Symbol" "S" symbol "math symbols, currency signs, dingbats, box-drawing characters, etc."
    , Category "Math_Symbol" "Sm" mathSymbol "any mathematical symbol"
    , Category "Currency_Symbol" "Sc" currencySymbol "any currency sign"
    , Category "Modifier_Symbol" "Sk" modifierSymbol "a combining character (mark) as a full character on its own"
    , Category "Other_Symbol" "So" otherSymbol "various symbols that are not math symbols, currency signs, or combining characters"
    , Category "Number" "N" number "any kind of numeric character in any script"
    , Category "Decimal_Digit_Number" "Nd" decimalNumber "a digit zero through nine in any script except ideographic scripts"
    , Category "Letter_Number" "Nl" letterNumber "a number that looks like a letter, such as a Roman numeral"
    , Category "Other_Number" "No" otherNumber "a superscript or subscript digit, or a number that is not a digit 0..9 (excluding numbers from ideographic scripts)"
    , Category "Punctuation" "P" punctuation "any kind of punctuation character"
    , Category "Dash_Punctuation" "Pd" dashPunctuation "any kind of hyphen or dash"
    , Category "Open_Punctuation" "Ps" openPunctuation "any kind of opening bracket"
    , Category "Close_Punctuation" "Pe" closePunctuation "any kind of closing bracket"
    , Category "Initial_Punctuation" "Pi" initialQuote "any kind of opening quote"
    , Category "Final_Punctuation" "Pf" finalQuote "any kind of closing quote"
    , Category "Connector_Punctuation" "Pc" connectorPunctuation "a punctuation character such as an underscore that connects words"
    , Category "Other_Punctuation" "Po" otherPunctuation "any kind of punctuation character that is not a dash, bracket, quote or connector"
    , Category "Other" "C" other "invisible control characters and unused code points"
    , Category "Control" "Cc" control "an ASCII 0x00..0x1F or Latin-1 0x80..0x9F control character"
    , Category "Format" "Cf" format "invisible formatting indicator"
    , Category "Private_Use" "Co" privateUse "any code point reserved for private use"
    , Category "Surrogate" "Cs" surrogate "one half of a surrogate pair in UTF-16 encoding"
      -- Stray ".properties" suffix removed from the end of this description.
    , Category "Unassigned" "Cn" notAssigned "any code point to which no character has been assigned."
    ]
-- | Index of 'categories' keyed by the canonicalized form of both the long
-- name and the abbreviation of every category, so either spelling finds
-- the same 'Category'.
lookupTable :: HashMap String Category
lookupTable = HashMap.fromList
    [ (canonicalize key, category)
    | category <- categories
      -- Record accessors instead of a positional pattern, so the table does
      -- not silently break if the field order of 'Category' ever changes.
    , key <- [categoryName category, categoryAbbreviation category]
    ]
-- | Look up a category by its long name or its abbreviation.
--
-- The argument is canonicalized before the lookup, so matching ignores
-- case, the characters @\'-\'@, @\'_\'@ and @\' \'@, and a leading @\"is\"@.
-- Returns 'Nothing' when no category matches.
lookupCategory :: String -> Maybe Category
lookupCategory s = HashMap.lookup (canonicalize s) lookupTable
-- | Like 'lookupCategory', but yields only the 'CharSet' of the matching
-- category.
lookupCategoryCharSet :: String -> Maybe CharSet
lookupCategoryCharSet = fmap categoryCharSet . lookupCategory
-- | Normalize a category name for lookup.
--
-- The input is lowercased, a leading @\"is\"@ (checked after lowercasing)
-- is dropped, and every @\'-\'@, @\'_\'@ and @\' \'@ is removed. For
-- example @\"Is_Lowercase-Letter\"@ becomes @\"lowercaseletter\"@.
canonicalize :: String -> String
canonicalize s = Prelude.filter (`notElem` "-_ ") (dropIs (Prelude.map toLower s))
  where
    -- 'Prelude.map' / 'Prelude.filter' are qualified because the enclosing
    -- module also imports "Data.CharSet" unqualified.
    dropIs ('i' : 's' : rest) = rest
    dropIs rest = rest
-- | The 'CharSet' built from the predicate \"the character's
-- 'generalCategory' equals the given category\".
cat :: GeneralCategory -> CharSet
cat category = build ((category ==) . generalCategory)
-- | Characters whose general category is 'LowercaseLetter' (Ll).
lowercaseLetter :: CharSet
lowercaseLetter = cat LowercaseLetter

-- | Characters whose general category is 'UppercaseLetter' (Lu).
uppercaseLetter :: CharSet
uppercaseLetter = cat UppercaseLetter

-- | Characters whose general category is 'TitlecaseLetter' (Lt).
titlecaseLetter :: CharSet
titlecaseLetter = cat TitlecaseLetter

-- | Union of 'lowercaseLetter', 'uppercaseLetter' and 'titlecaseLetter'
-- (the @L&@ grouping).
letterAnd :: CharSet
letterAnd = lowercaseLetter
    `union` uppercaseLetter
    `union` titlecaseLetter

-- | Characters whose general category is 'ModifierLetter' (Lm).
modifierLetter :: CharSet
modifierLetter = cat ModifierLetter

-- | Characters whose general category is 'OtherLetter' (Lo).
otherLetter :: CharSet
otherLetter = cat OtherLetter

-- | Union of 'letterAnd', 'modifierLetter' and 'otherLetter' (L).
letter :: CharSet
letter = letterAnd
    `union` modifierLetter
    `union` otherLetter
-- | Characters whose general category is 'NonSpacingMark' (Mn).
nonSpacingMark :: CharSet
nonSpacingMark = cat NonSpacingMark

-- | Characters whose general category is 'SpacingCombiningMark' (Mc).
spacingCombiningMark :: CharSet
spacingCombiningMark = cat SpacingCombiningMark

-- | Characters whose general category is 'EnclosingMark' (Me).
enclosingMark :: CharSet
enclosingMark = cat EnclosingMark

-- | Union of 'nonSpacingMark', 'spacingCombiningMark' and 'enclosingMark'
-- (M).
mark :: CharSet
mark = nonSpacingMark
    `union` spacingCombiningMark
    `union` enclosingMark
-- | Characters whose general category is 'Space' (Zs).
space :: CharSet
space = cat Space

-- | Characters whose general category is 'LineSeparator' (Zl).
lineSeparator :: CharSet
lineSeparator = cat LineSeparator

-- | Characters whose general category is 'ParagraphSeparator' (Zp).
paragraphSeparator :: CharSet
paragraphSeparator = cat ParagraphSeparator

-- | Union of 'space', 'lineSeparator' and 'paragraphSeparator' (Z).
separator :: CharSet
separator = space
    `union` lineSeparator
    `union` paragraphSeparator
-- | Characters whose general category is 'MathSymbol' (Sm).
mathSymbol :: CharSet
mathSymbol = cat MathSymbol

-- | Characters whose general category is 'CurrencySymbol' (Sc).
currencySymbol :: CharSet
currencySymbol = cat CurrencySymbol

-- | Characters whose general category is 'ModifierSymbol' (Sk).
modifierSymbol :: CharSet
modifierSymbol = cat ModifierSymbol

-- | Characters whose general category is 'OtherSymbol' (So).
otherSymbol :: CharSet
otherSymbol = cat OtherSymbol

-- | Union of 'mathSymbol', 'currencySymbol', 'modifierSymbol' and
-- 'otherSymbol' (S).
symbol :: CharSet
symbol = mathSymbol
    `union` currencySymbol
    `union` modifierSymbol
    `union` otherSymbol
-- | Characters whose general category is 'DecimalNumber' (Nd).
decimalNumber :: CharSet
decimalNumber = cat DecimalNumber

-- | Characters whose general category is 'LetterNumber' (Nl).
letterNumber :: CharSet
letterNumber = cat LetterNumber

-- | Characters whose general category is 'OtherNumber' (No).
otherNumber :: CharSet
otherNumber = cat OtherNumber

-- | Union of 'decimalNumber', 'letterNumber' and 'otherNumber' (N).
number :: CharSet
number = decimalNumber
    `union` letterNumber
    `union` otherNumber
-- | Characters whose general category is 'DashPunctuation' (Pd).
dashPunctuation :: CharSet
dashPunctuation = cat DashPunctuation

-- | Characters whose general category is 'OpenPunctuation' (Ps).
openPunctuation :: CharSet
openPunctuation = cat OpenPunctuation

-- | Characters whose general category is 'ClosePunctuation' (Pe).
closePunctuation :: CharSet
closePunctuation = cat ClosePunctuation

-- | Characters whose general category is 'InitialQuote' (Pi).
initialQuote :: CharSet
initialQuote = cat InitialQuote

-- | Characters whose general category is 'FinalQuote' (Pf).
finalQuote :: CharSet
finalQuote = cat FinalQuote

-- | Characters whose general category is 'ConnectorPunctuation' (Pc).
connectorPunctuation :: CharSet
connectorPunctuation = cat ConnectorPunctuation

-- | Characters whose general category is 'OtherPunctuation' (Po).
otherPunctuation :: CharSet
otherPunctuation = cat OtherPunctuation

-- | Union of all seven punctuation subcategories (P).
punctuation :: CharSet
punctuation = dashPunctuation
    `union` openPunctuation
    `union` closePunctuation
    `union` initialQuote
    `union` finalQuote
    `union` connectorPunctuation
    `union` otherPunctuation
-- | Characters whose general category is 'Control' (Cc).
control :: CharSet
control = cat Control

-- | Characters whose general category is 'Format' (Cf).
format :: CharSet
format = cat Format

-- | Characters whose general category is 'PrivateUse' (Co).
privateUse :: CharSet
privateUse = cat PrivateUse

-- | Characters whose general category is 'Surrogate' (Cs).
surrogate :: CharSet
surrogate = cat Surrogate

-- | Characters whose general category is 'NotAssigned' (Cn).
notAssigned :: CharSet
notAssigned = cat NotAssigned

-- | Union of 'control', 'format', 'privateUse', 'surrogate' and
-- 'notAssigned' (C).
other :: CharSet
other = control
    `union` format
    `union` privateUse
    `union` surrogate
    `union` notAssigned