00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #ifndef __UNIBROW_INL_H__
00029 #define __UNIBROW_INL_H__
00030
00031 #include "unicode.h"
00032
00033 namespace unibrow {
00034
00035 template <class T, int s> bool Predicate<T, s>::get(uchar code_point) {
00036 CacheEntry entry = entries_[code_point & kMask];
00037 if (entry.code_point_ == code_point) return entry.value_;
00038 return CalculateValue(code_point);
00039 }
00040
00041 template <class T, int s> bool Predicate<T, s>::CalculateValue(
00042 uchar code_point) {
00043 bool result = T::Is(code_point);
00044 entries_[code_point & kMask] = CacheEntry(code_point, result);
00045 return result;
00046 }
00047
00048 template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n,
00049 uchar* result) {
00050 CacheEntry entry = entries_[c & kMask];
00051 if (entry.code_point_ == c) {
00052 if (entry.offset_ == 0) {
00053 return 0;
00054 } else {
00055 result[0] = c + entry.offset_;
00056 return 1;
00057 }
00058 } else {
00059 return CalculateValue(c, n, result);
00060 }
00061 }
00062
00063 template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
00064 uchar* result) {
00065 bool allow_caching = true;
00066 int length = T::Convert(c, n, result, &allow_caching);
00067 if (allow_caching) {
00068 if (length == 1) {
00069 entries_[c & kMask] = CacheEntry(c, result[0] - c);
00070 return 1;
00071 } else {
00072 entries_[c & kMask] = CacheEntry(c, 0);
00073 return 0;
00074 }
00075 } else {
00076 return length;
00077 }
00078 }
00079
00080
00081 unsigned Utf8::Encode(char* str, uchar c) {
00082 static const int kMask = ~(1 << 6);
00083 if (c <= kMaxOneByteChar) {
00084 str[0] = c;
00085 return 1;
00086 } else if (c <= kMaxTwoByteChar) {
00087 str[0] = 0xC0 | (c >> 6);
00088 str[1] = 0x80 | (c & kMask);
00089 return 2;
00090 } else if (c <= kMaxThreeByteChar) {
00091 str[0] = 0xE0 | (c >> 12);
00092 str[1] = 0x80 | ((c >> 6) & kMask);
00093 str[2] = 0x80 | (c & kMask);
00094 return 3;
00095 } else {
00096 str[0] = 0xF0 | (c >> 18);
00097 str[1] = 0x80 | ((c >> 12) & kMask);
00098 str[2] = 0x80 | ((c >> 6) & kMask);
00099 str[3] = 0x80 | (c & kMask);
00100 return 4;
00101 }
00102 }
00103
00104
00105 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {
00106 if (length <= 0) return kBadChar;
00107 byte first = bytes[0];
00108
00109 if (first <= kMaxOneByteChar) {
00110 *cursor += 1;
00111 return first;
00112 }
00113 return CalculateValue(bytes, length, cursor);
00114 }
00115
00116 unsigned Utf8::Length(uchar c) {
00117 if (c <= kMaxOneByteChar) {
00118 return 1;
00119 } else if (c <= kMaxTwoByteChar) {
00120 return 2;
00121 } else if (c <= kMaxThreeByteChar) {
00122 return 3;
00123 } else {
00124 return 4;
00125 }
00126 }
00127
00128 uchar CharacterStream::GetNext() {
00129 uchar result = DecodeCharacter(buffer_, &cursor_);
00130 if (remaining_ == 1) {
00131 cursor_ = 0;
00132 FillBuffer();
00133 } else {
00134 remaining_--;
00135 }
00136 return result;
00137 }
00138
// IF_LITTLE(stmt) / IF_BIG(stmt) expand to |stmt| only when __BYTE_ORDER
// matches the respective endianness, and to the no-op expression ((void) 0)
// otherwise.  If __BYTE_ORDER matches neither value only a warning is issued
// and neither macro is defined.
#if __BYTE_ORDER == __LITTLE_ENDIAN
#  define IF_LITTLE(stmt) stmt
#  define IF_BIG(stmt) ((void) 0)
#elif __BYTE_ORDER == __BIG_ENDIAN
#  define IF_LITTLE(stmt) ((void) 0)
#  define IF_BIG(stmt) stmt
#else
#  warning Unknown byte ordering
#endif
00148
00149 bool CharacterStream::EncodeAsciiCharacter(uchar c, byte* buffer,
00150 unsigned capacity, unsigned& offset) {
00151 if (offset >= capacity) return false;
00152 buffer[offset] = c;
00153 offset += 1;
00154 return true;
00155 }
00156
00157 bool CharacterStream::EncodeNonAsciiCharacter(uchar c, byte* buffer,
00158 unsigned capacity, unsigned& offset) {
00159 unsigned aligned = (offset + 0x3) & ~0x3;
00160 if ((aligned + sizeof(uchar)) > capacity)
00161 return false;
00162 if (offset == aligned) {
00163 IF_LITTLE(*reinterpret_cast<uchar*>(buffer + aligned) = (c << 8) | 0x80);
00164 IF_BIG(*reinterpret_cast<uchar*>(buffer + aligned) = c | (1 << 31));
00165 } else {
00166 buffer[offset] = 0x80;
00167 IF_LITTLE(*reinterpret_cast<uchar*>(buffer + aligned) = c << 8);
00168 IF_BIG(*reinterpret_cast<uchar*>(buffer + aligned) = c);
00169 }
00170 offset = aligned + sizeof(uchar);
00171 return true;
00172 }
00173
00174 bool CharacterStream::EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
00175 unsigned& offset) {
00176 if (c <= Utf8::kMaxOneByteChar) {
00177 return EncodeAsciiCharacter(c, buffer, capacity, offset);
00178 } else {
00179 return EncodeNonAsciiCharacter(c, buffer, capacity, offset);
00180 }
00181 }
00182
00183 uchar CharacterStream::DecodeCharacter(const byte* buffer, unsigned* offset) {
00184 byte b = buffer[*offset];
00185 if (b <= Utf8::kMaxOneByteChar) {
00186 (*offset)++;
00187 return b;
00188 } else {
00189 unsigned aligned = (*offset + 0x3) & ~0x3;
00190 *offset = aligned + sizeof(uchar);
00191 IF_LITTLE(return *reinterpret_cast<const uchar*>(buffer + aligned) >> 8);
00192 IF_BIG(return *reinterpret_cast<const uchar*>(buffer + aligned) &
00193 ~(1 << 31));
00194 }
00195 }
00196
// The endianness helpers are only meant for the encode/decode routines
// above; drop them so they do not leak into files including this header.
#undef IF_BIG
#undef IF_LITTLE
00199
00200 template <class R, class I, unsigned s>
00201 void InputBuffer<R, I, s>::FillBuffer() {
00202 buffer_ = R::ReadBlock(input_, util_buffer_, s, &remaining_, &offset_);
00203 }
00204
00205 template <class R, class I, unsigned s>
00206 void InputBuffer<R, I, s>::Rewind() {
00207 Reset(input_);
00208 }
00209
00210 template <class R, class I, unsigned s>
00211 void InputBuffer<R, I, s>::Reset(unsigned position, I input) {
00212 input_ = input;
00213 remaining_ = 0;
00214 cursor_ = 0;
00215 offset_ = position;
00216 buffer_ = R::ReadBlock(input_, util_buffer_, s, &remaining_, &offset_);
00217 }
00218
00219 template <class R, class I, unsigned s>
00220 void InputBuffer<R, I, s>::Reset(I input) {
00221 Reset(0, input);
00222 }
00223
00224 template <class R, class I, unsigned s>
00225 void InputBuffer<R, I, s>::Seek(unsigned position) {
00226 offset_ = position;
00227 buffer_ = R::ReadBlock(input_, util_buffer_, s, &remaining_, &offset_);
00228 }
00229
00230 template <unsigned s>
00231 Utf8InputBuffer<s>::Utf8InputBuffer(const char* data, unsigned length)
00232 : InputBuffer<Utf8, Buffer<const char*>, s>(Buffer<const char*>(data,
00233 length)) {
00234 }
00235
00236 }
00237
00238 #endif // __UNIBROW_INL_H__