Lucene++ - a full-featured, C++ search engine
API Documentation


StandardTokenizer.h
Go to the documentation of this file.
1 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
6 
7 #ifndef STANDARDTOKENIZER_H
8 #define STANDARDTOKENIZER_H
9 
10 #include "Tokenizer.h"
11 
12 namespace Lucene {
13 
// A grammar-based tokenizer.
// NOTE(review): this listing is lossy. The cross-reference index on this page also
// lists members that do not appear in the lines below: scanner, replaceInvalidAcronym,
// termAtt, offsetAtt, posIncrAtt, typeAtt and TOKEN_TYPES().
34 class LPPAPI StandardTokenizer : public Tokenizer {
35 public:
    // Creates a new instance of the StandardTokenizer. Attaches the input to the
    // newly created scanner.
38  StandardTokenizer(LuceneVersion::Version matchVersion, const ReaderPtr& input);
39 
    // Creates a new StandardTokenizer with a given AttributeSource.
41  StandardTokenizer(LuceneVersion::Version matchVersion, const AttributeSourcePtr& source, const ReaderPtr& input);
42 
    // Creates a new StandardTokenizer with a given AttributeFactory.
44  StandardTokenizer(LuceneVersion::Version matchVersion, const AttributeFactoryPtr& factory, const ReaderPtr& input);
45 
46  virtual ~StandardTokenizer();
47 
49 
50 protected:
53 
    // Max allowed token length; see setMaxTokenLength() for how it is applied.
55  int32_t maxTokenLength;
56 
57  // this tokenizer generates four attributes: term, offset, positionIncrement and type
    // (the index on this page lists termAtt, offsetAtt, posIncrAtt and typeAtt at header lines 58-61)
62 
63 public:
    // Token type int constants. The index describes TOKEN_TYPES() as the string token
    // types that correspond to these constants. Values are defined outside this header.
64  static const int32_t ALPHANUM;
65  static const int32_t APOSTROPHE;
66  static const int32_t ACRONYM;
67  static const int32_t COMPANY;
68  static const int32_t EMAIL;
69  static const int32_t HOST;
70  static const int32_t NUM;
71  static const int32_t CJ;
72 
    // NOTE(review): presumably the deprecated acronym token type (going by the _DEP suffix) - confirm.
74  static const int32_t ACRONYM_DEP;
75 
78 
79 protected:
    // NOTE(review): no description on this page; presumably shared set-up used by the
    // three constructors - confirm against the implementation.
80  void init(const ReaderPtr& input, LuceneVersion::Version matchVersion);
81 
82 public:
    // Set the max allowed token length. Any token longer than this is skipped.
84  void setMaxTokenLength(int32_t length);
85 
    // NOTE(review): presumably returns the value set by setMaxTokenLength() - confirm.
87  int32_t getMaxTokenLength();
88 
    // NOTE(review): no description on this page; the end() description implies the
    // consumer calls this repeatedly until the last token has been consumed.
90  virtual bool incrementToken();
91 
    // Called by the consumer after the last token has been consumed.
92  virtual void end();
93 
    // Reset the tokenizer to a new reader.
94  virtual void reset(const ReaderPtr& input);
95 
99 
    // NOTE(review): no description on this page; presumably stores the flag in the
    // replaceInvalidAcronym member listed in the index - confirm.
102  void setReplaceInvalidAcronym(bool replaceInvalidAcronym);
103 };
104 
105 }
106 
107 #endif
#define LUCENE_CLASS(Name)
Definition: LuceneObject.h:24
Version
Definition: Constants.h:40
A grammar-based tokenizer.
Definition: StandardTokenizer.h:34
static const int32_t ACRONYM_DEP
Definition: StandardTokenizer.h:74
static const Collection< String > TOKEN_TYPES()
String token types that correspond to token type int constants.
virtual void end()
This method is called by the consumer after the last token has been consumed, after incrementToken() returned false (using the new TokenStream API).
StandardTokenizerImplPtr scanner
A private instance of the scanner.
Definition: StandardTokenizer.h:48
StandardTokenizer(LuceneVersion::Version matchVersion, const ReaderPtr &input)
Creates a new instance of the StandardTokenizer. Attaches the input to the newly created scanner.
static const int32_t CJ
Definition: StandardTokenizer.h:71
static const int32_t ACRONYM
Definition: StandardTokenizer.h:66
void setReplaceInvalidAcronym(bool replaceInvalidAcronym)
int32_t maxTokenLength
Definition: StandardTokenizer.h:55
void init(const ReaderPtr &input, LuceneVersion::Version matchVersion)
void setMaxTokenLength(int32_t length)
Set the max allowed token length. Any token longer than this is skipped.
bool replaceInvalidAcronym
Definition: StandardTokenizer.h:54
static const int32_t COMPANY
Definition: StandardTokenizer.h:67
static const int32_t HOST
Definition: StandardTokenizer.h:69
StandardTokenizer(LuceneVersion::Version matchVersion, const AttributeFactoryPtr &factory, const ReaderPtr &input)
Creates a new StandardTokenizer with a given AttributeFactory.
TypeAttributePtr typeAtt
Definition: StandardTokenizer.h:61
StandardTokenizer(LuceneVersion::Version matchVersion, const AttributeSourcePtr &source, const ReaderPtr &input)
Creates a new StandardTokenizer with a given AttributeSource.
TermAttributePtr termAtt
Definition: StandardTokenizer.h:58
static const int32_t APOSTROPHE
Definition: StandardTokenizer.h:65
virtual void reset(const ReaderPtr &input)
Reset the tokenizer to a new reader. Typically, an analyzer (in its reusableTokenStream method) will use this to re-use a previously created tokenizer.
static const int32_t NUM
Definition: StandardTokenizer.h:70
PositionIncrementAttributePtr posIncrAtt
Definition: StandardTokenizer.h:60
OffsetAttributePtr offsetAtt
Definition: StandardTokenizer.h:59
virtual bool incrementToken()
static const int32_t ALPHANUM
Definition: StandardTokenizer.h:64
static const int32_t EMAIL
Definition: StandardTokenizer.h:68
A Tokenizer is a TokenStream whose input is a Reader.
Definition: Tokenizer.h:20
Definition: AbstractAllTermDocs.h:12
boost::shared_ptr< AttributeSource > AttributeSourcePtr
Definition: LuceneTypes.h:520
boost::shared_ptr< PositionIncrementAttribute > PositionIncrementAttributePtr
Definition: LuceneTypes.h:45
boost::shared_ptr< TermAttribute > TermAttributePtr
Definition: LuceneTypes.h:58
boost::shared_ptr< OffsetAttribute > OffsetAttributePtr
Definition: LuceneTypes.h:40
boost::shared_ptr< StandardTokenizerImpl > StandardTokenizerImplPtr
Definition: LuceneTypes.h:53
boost::shared_ptr< Reader > ReaderPtr
Definition: LuceneTypes.h:547
boost::shared_ptr< AttributeFactory > AttributeFactoryPtr
Definition: LuceneTypes.h:519
boost::shared_ptr< TypeAttribute > TypeAttributePtr
Definition: LuceneTypes.h:64

clucene.sourceforge.net