00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #ifndef __UNIBROW_H__
00029 #define __UNIBROW_H__
00030
00031 #include <sys/types.h>
00032
00038 namespace unibrow {
00039
// Integer type used for character values throughout this header.
typedef unsigned int uchar;

// Single raw octet, used for encoded data buffers in this header.
typedef unsigned char byte;

// NOTE(review): presumably the largest number of characters a single case
// conversion can produce -- confirm against the ToLowercase/ToUppercase
// definitions, which are not part of this header.
static const int kMaxCaseConvertedSize = 3;
00048
00049 template <class T, int size = 256>
00050 class Predicate {
00051 public:
00052 inline Predicate() { }
00053 inline bool get(uchar c);
00054 private:
00055 friend class Test;
00056 bool CalculateValue(uchar c);
00057 struct CacheEntry {
00058 inline CacheEntry() : code_point_(0), value_(0) { }
00059 inline CacheEntry(uchar code_point, bool value)
00060 : code_point_(code_point),
00061 value_(value) { }
00062 uchar code_point_ : 21;
00063 bool value_ : 1;
00064 };
00065 static const int kSize = size;
00066 static const int kMask = kSize - 1;
00067 CacheEntry entries_[kSize];
00068 };
00069
00070
00071
00072
00073
00074 template <class T, int size = 256>
00075 class Mapping {
00076 public:
00077 inline Mapping() { }
00078 inline int get(uchar c, uchar n, uchar* result);
00079 private:
00080 friend class Test;
00081 int CalculateValue(uchar c, uchar n, uchar* result);
00082 struct CacheEntry {
00083 inline CacheEntry() : code_point_(0), offset_(0) { }
00084 inline CacheEntry(uchar code_point, signed offset)
00085 : code_point_(code_point),
00086 offset_(offset) { }
00087 uchar code_point_ : 21;
00088 signed offset_ : 11;
00089 };
00090 static const int kSize = size;
00091 static const int kMask = kSize - 1;
00092 CacheEntry entries_[kSize];
00093 };
00094
00095 class UnicodeData {
00096 private:
00097 friend class Test;
00098 static int GetByteCount();
00099 static uchar kMaxCodePoint;
00100 };
00101
00102
00103
// Pairs a data handle of type Data with an unsigned length.
// The class only stores and returns the two values.
template <typename Data>
class Buffer {
 public:
  // Stores |data| and |length| as given.
  inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }

  // Empty buffer: the data handle is initialised from 0 and the length is 0.
  inline Buffer() : data_(0), length_(0) { }

  // Accessors are const so that they can be used on const Buffers and
  // through const references.
  Data data() const { return data_; }
  unsigned length() const { return length_; }

 private:
  Data data_;
  unsigned length_;
};
00115
00116 class Utf8 {
00117 public:
00118 static inline uchar Length(uchar chr);
00119 static inline unsigned Encode(char* out, uchar c);
00120 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
00121 unsigned capacity, unsigned* chars_read, unsigned* offset);
00122 static const uchar kBadChar = 0xFFFD;
00123 static const unsigned kMaxEncodedSize = 4;
00124 static const unsigned kMaxOneByteChar = 0x7f;
00125 static const unsigned kMaxTwoByteChar = 0x7ff;
00126 static const unsigned kMaxThreeByteChar = 0xffff;
00127 static const unsigned kMaxFourByteChar = 0x1fffff;
00128
00129 private:
00130 template <unsigned s> friend class Utf8InputBuffer;
00131 friend class Test;
00132 static inline uchar ValueOf(const byte* str,
00133 unsigned length,
00134 unsigned* cursor);
00135 static uchar CalculateValue(const byte* str,
00136 unsigned length,
00137 unsigned* cursor);
00138 };
00139
00140
00141
00142 class CharacterStream {
00143 public:
00144 inline uchar GetNext();
00145 inline bool has_more() { return remaining_ != 0; }
00146
00147 virtual void Seek(unsigned);
00148 unsigned Length();
00149 virtual ~CharacterStream() { }
00150 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
00151 unsigned& offset);
00152 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
00153 unsigned capacity, unsigned& offset);
00154 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
00155 unsigned capacity, unsigned& offset);
00156 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
00157 virtual void Rewind() = 0;
00158 protected:
00159 virtual void FillBuffer() = 0;
00160
00161 unsigned remaining_;
00162
00163 unsigned cursor_;
00164
00165 const byte* buffer_;
00166 };
00167
00168
00169
00176 template <class Reader, class Input = Reader*, unsigned kSize = 256>
00177 class InputBuffer : public CharacterStream {
00178 public:
00179 virtual void Rewind();
00180 inline void Reset(Input input);
00181 void Seek(unsigned position);
00182 inline void Reset(unsigned position, Input input);
00183 protected:
00184 InputBuffer() { }
00185 explicit InputBuffer(Input input) { Reset(input); }
00186 virtual void FillBuffer();
00187
00188
00189
00190 unsigned offset_;
00191
00192 Input input_;
00193
00194
00195
00196
00197 byte util_buffer_[kSize];
00198 };
00199
00200
00201
00202 template <unsigned s = 256>
00203 class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
00204 public:
00205 inline Utf8InputBuffer() { }
00206 inline Utf8InputBuffer(const char* data, unsigned length);
00207 inline void Reset(const char* data, unsigned length) {
00208 InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
00209 Buffer<const char*>(data, length));
00210 }
00211 };
00212
00213 struct Uppercase {
00214 static bool Is(uchar c);
00215 };
00216 struct Lowercase {
00217 static bool Is(uchar c);
00218 };
00219 struct Letter {
00220 static bool Is(uchar c);
00221 };
00222 struct Space {
00223 static bool Is(uchar c);
00224 };
00225 struct Titlecase {
00226 static bool Is(uchar c);
00227 };
00228 struct Number {
00229 static bool Is(uchar c);
00230 };
00231 struct DecimalDigit {
00232 static bool Is(uchar c);
00233 };
00234 struct Ideographic {
00235 static bool Is(uchar c);
00236 };
00237 struct WhiteSpace {
00238 static bool Is(uchar c);
00239 };
00240 struct HexDigit {
00241 static bool Is(uchar c);
00242 };
00243 struct AsciiHexDigit {
00244 static bool Is(uchar c);
00245 };
00246 struct BidiControl {
00247 static bool Is(uchar c);
00248 };
00249 struct JoinControl {
00250 static bool Is(uchar c);
00251 };
00252 struct Dash {
00253 static bool Is(uchar c);
00254 };
00255 struct Hyphen {
00256 static bool Is(uchar c);
00257 };
00258 struct LineTerminator {
00259 static bool Is(uchar c);
00260 };
00261 struct RegExpSpecialChar {
00262 static bool Is(uchar c);
00263 };
00264 struct CombiningMark {
00265 static bool Is(uchar c);
00266 };
00267 struct ConnectorPunctuation {
00268 static bool Is(uchar c);
00269 };
00270 struct ToLowercase {
00271 static int Convert(uchar c,
00272 uchar n,
00273 uchar* result,
00274 bool* allow_caching_ptr);
00275 };
00276 struct ToUppercase {
00277 static int Convert(uchar c,
00278 uchar n,
00279 uchar* result,
00280 bool* allow_caching_ptr);
00281 };
00282
00283 }
00284
00285 #endif // __UNIBROW_H__