00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #include "v8.h"
00029
00030 #include "ast.h"
00031 #include "scanner.h"
00032
00033 namespace v8 { namespace internal {
00034
00035
00036
00037
00038
// Out-of-line definitions of Scanner's static character-class predicates.
// The scanning routines in this file query them through .get(ch).
// NOTE(review): the second template argument (128) is presumably a
// lookup-cache size -- confirm against the unibrow headers.
00039 unibrow::Predicate<IdentifierStart, 128> Scanner::kIsIdentifierStart;
00040 unibrow::Predicate<IdentifierPart, 128> Scanner::kIsIdentifierPart;
00041 unibrow::Predicate<unibrow::LineTerminator, 128> Scanner::kIsLineTerminator;
00042 unibrow::Predicate<unibrow::WhiteSpace, 128> Scanner::kIsWhiteSpace;
00043
00044
// Out-of-line definition of the static UTF-8 decoder resource. It is not
// referenced by any other code visible in this file.
00045 StaticResource<Scanner::Utf8Decoder> Scanner::utf8_decoder_;
00046
00047
00048
00049
00050
00051 UTF8Buffer::UTF8Buffer() : data_(NULL) {
00052 Initialize(NULL, 0);
00053 }
00054
00055
00056 UTF8Buffer::~UTF8Buffer() {
00057 DeleteArray(data_);
00058 }
00059
00060
00061 void UTF8Buffer::Initialize(char* src, int length) {
00062 DeleteArray(data_);
00063 data_ = src;
00064 size_ = length;
00065 Reset();
00066 }
00067
00068
00069 void UTF8Buffer::AddChar(uc32 c) {
00070 const int min_size = 1024;
00071 if (pos_ + static_cast<int>(unibrow::Utf8::kMaxEncodedSize) > size_) {
00072 int new_size = size_ * 2;
00073 if (new_size < min_size) {
00074 new_size = min_size;
00075 }
00076 char* new_data = NewArray<char>(new_size);
00077 memcpy(new_data, data_, pos_);
00078 DeleteArray(data_);
00079 data_ = new_data;
00080 size_ = new_size;
00081 }
00082 if (static_cast<unsigned>(c) < unibrow::Utf8::kMaxOneByteChar) {
00083 data_[pos_++] = c;
00084 } else {
00085 pos_ += unibrow::Utf8::Encode(&data_[pos_], c);
00086 }
00087 ASSERT(pos_ <= size_);
00088 }
00089
00090
00091
00092
00093
00094
00095 UTF16Buffer::UTF16Buffer()
00096 : pos_(0),
00097 pushback_buffer_(0),
00098 last_(0),
00099 stream_(NULL) { }
00100
00101
00102 void UTF16Buffer::Initialize(Handle<String> data,
00103 unibrow::CharacterStream* input) {
00104 data_ = data;
00105 pos_ = 0;
00106 stream_ = input;
00107 }
00108
00109
00110 Handle<String> UTF16Buffer::SubString(int start, int end) {
00111 return internal::SubString(data_, start, end);
00112 }
00113
00114
00115 void UTF16Buffer::PushBack(uc32 ch) {
00116 pushback_buffer()->Add(last_);
00117 last_ = ch;
00118 pos_--;
00119 }
00120
00121
00122 uc32 UTF16Buffer::Advance() {
00123
00124
00125
00126
00127
00128
00129
00130
00131 if (!pushback_buffer()->is_empty()) {
00132 pos_++;
00133 return last_ = pushback_buffer()->RemoveLast();
00134 } else if (stream_->has_more()) {
00135 pos_++;
00136 uc32 next = stream_->GetNext();
00137 return last_ = next;
00138 } else {
00139
00140
00141 pos_++;
00142 return last_ = static_cast<uc32>(-1);
00143 }
00144 }
00145
00146
00147 void UTF16Buffer::SeekForward(int pos) {
00148 pos_ = pos;
00149 ASSERT(pushback_buffer()->is_empty());
00150 stream_->Seek(pos);
00151 }
00152
00153
00154
00155
00156
00157 Scanner::Scanner(bool pre) : stack_overflow_(false), is_pre_parsing_(pre) {
00158 Token::Initialize();
00159 }
00160
00161
00162 void Scanner::Init(Handle<String> source, unibrow::CharacterStream* stream,
00163 int position) {
00164
00165 source_.Initialize(source, stream);
00166 position_ = position;
00167
00168
00169 literals_.Reset();
00170
00171
00172 ASSERT(kCharacterLookaheadBufferSize == 1);
00173 Advance();
00174
00175
00176
00177 SkipWhiteSpace(true);
00178 Scan();
00179 }
00180
00181
00182 Handle<String> Scanner::SubString(int start, int end) {
00183 return source_.SubString(start - position_, end - position_);
00184 }
00185
00186
00187 Token::Value Scanner::Next() {
00188
00189
00190
00191 current_ = next_;
00192
00193 StackLimitCheck check;
00194 if (check.HasOverflowed()) {
00195 stack_overflow_ = true;
00196 next_.token = Token::ILLEGAL;
00197 } else {
00198 Scan();
00199 }
00200 return current_.token;
00201 }
00202
00203
00204 void Scanner::StartLiteral() {
00205 next_.literal_pos = literals_.pos();
00206 }
00207
00208
00209 void Scanner::AddChar(uc32 c) {
00210 literals_.AddChar(c);
00211 }
00212
00213
00214 void Scanner::TerminateLiteral() {
00215 next_.literal_end = literals_.pos();
00216 AddChar(0);
00217 }
00218
00219
00220 void Scanner::AddCharAdvance() {
00221 AddChar(c0_);
00222 Advance();
00223 }
00224
00225
00226 void Scanner::Advance() {
00227 c0_ = source_.Advance();
00228 }
00229
00230
00231 void Scanner::PushBack(uc32 ch) {
00232 source_.PushBack(ch);
00233 c0_ = ch;
00234 }
00235
00236
00237 void Scanner::SkipWhiteSpace(bool initial) {
00238 has_line_terminator_before_next_ = initial;
00239
00240 while (true) {
00241 while (kIsWhiteSpace.get(c0_)) {
00242
00243 if (kIsLineTerminator.get(c0_))
00244
00245
00246 has_line_terminator_before_next_ = true;
00247 Advance();
00248 }
00249
00250
00251
00252
00253
00254 if (c0_ == '-' && has_line_terminator_before_next_) {
00255 Advance();
00256 if (c0_ == '-') {
00257 Advance();
00258 if (c0_ == '>') {
00259
00260 SkipSingleLineComment();
00261
00262 continue;
00263 }
00264 PushBack('-');
00265 }
00266 PushBack('-');
00267 }
00268 return;
00269 }
00270 }
00271
00272
00273 Token::Value Scanner::SkipSingleLineComment() {
00274 Advance();
00275
00276
00277
00278
00279
00280
00281 while (c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
00282 Advance();
00283 }
00284
00285 return Token::COMMENT;
00286 }
00287
00288
00289 Token::Value Scanner::SkipMultiLineComment() {
00290 ASSERT(c0_ == '*');
00291 Advance();
00292
00293 while (c0_ >= 0) {
00294 char ch = c0_;
00295 Advance();
00296
00297
00298
00299
00300
00301
00302
00303 if (ch == '*' && c0_ == '/') {
00304 c0_ = ' ';
00305 return Token::COMMENT;
00306 }
00307 }
00308
00309
00310 return Token::ILLEGAL;
00311 }
00312
00313
00314 Token::Value Scanner::ScanHtmlComment() {
00315
00316 ASSERT(c0_ == '!');
00317 Advance();
00318 if (c0_ == '-') {
00319 Advance();
00320 if (c0_ == '-') return SkipSingleLineComment();
00321 PushBack('-');
00322 }
00323 PushBack('!');
00324 ASSERT(c0_ == '!');
00325 return Token::LT;
00326 }
00327
00328
00329 void Scanner::Scan() {
00330 Token::Value token;
00331 bool has_line_terminator = false;
00332 do {
00333 SkipWhiteSpace(has_line_terminator);
00334
00335
00336 has_line_terminator = has_line_terminator_before_next();
00337
00338
00339 next_.location.beg_pos = source_pos();
00340
00341 token = ScanToken();
00342 } while (token == Token::COMMENT);
00343
00344 next_.location.end_pos = source_pos();
00345 next_.token = token;
00346 }
00347
00348
00349 void Scanner::SeekForward(int pos) {
00350 source_.SeekForward(pos - 1);
00351 Advance();
00352 Scan();
00353 }
00354
00355
00356 uc32 Scanner::ScanHexEscape(uc32 c, int length) {
00357 ASSERT(length <= 4);
00358
00359 uc32 digits[4];
00360 uc32 x = 0;
00361 for (int i = 0; i < length; i++) {
00362 digits[i] = c0_;
00363 int d = HexValue(c0_);
00364 if (d < 0) {
00365
00366
00367
00368
00369
00370 for (int j = i-1; j >= 0; j--) {
00371 PushBack(digits[j]);
00372 }
00373
00374 return c;
00375 }
00376 x = x * 16 + d;
00377 Advance();
00378 }
00379
00380 return x;
00381 }
00382
00383
00384
00385
00386 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
00387 uc32 x = c - '0';
00388 for (int i = 0; i < length; i++) {
00389 int d = c0_ - '0';
00390 if (d < 0 || d > 7) break;
00391 int nx = x * 8 + d;
00392 if (nx >= 256) break;
00393 x = nx;
00394 Advance();
00395 }
00396 return x;
00397 }
00398
00399
00400 void Scanner::ScanEscape() {
00401 uc32 c = c0_;
00402 Advance();
00403
00404
00405 if (kIsLineTerminator.get(c)) {
00406
00407 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
00408
00409 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
00410 return;
00411 }
00412
00413 switch (c) {
00414 case '\'':
00415 case '"' :
00416 case '\\': break;
00417 case 'b' : c = '\b'; break;
00418 case 'f' : c = '\f'; break;
00419 case 'n' : c = '\n'; break;
00420 case 'r' : c = '\r'; break;
00421 case 't' : c = '\t'; break;
00422 case 'u' : c = ScanHexEscape(c, 4); break;
00423 case 'v' : c = '\v'; break;
00424 case 'x' : c = ScanHexEscape(c, 2); break;
00425 case '0' :
00426 case '1' :
00427 case '2' :
00428 case '3' :
00429 case '4' :
00430 case '5' :
00431 case '6' :
00432 case '7' : c = ScanOctalEscape(c, 2); break;
00433 }
00434
00435
00436
00437
00438 AddChar(c);
00439 }
00440
00441
00442 Token::Value Scanner::ScanString() {
00443 uc32 quote = c0_;
00444 Advance();
00445
00446 StartLiteral();
00447 while (c0_ != quote && c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
00448 uc32 c = c0_;
00449 Advance();
00450 if (c == '\\') {
00451 if (c0_ < 0) return Token::ILLEGAL;
00452 ScanEscape();
00453 } else {
00454 AddChar(c);
00455 }
00456 }
00457 if (c0_ != quote) {
00458 return Token::ILLEGAL;
00459 }
00460 TerminateLiteral();
00461
00462 Advance();
00463 return Token::STRING;
00464 }
00465
00466
00467 Token::Value Scanner::Select(Token::Value tok) {
00468 Advance();
00469 return tok;
00470 }
00471
00472
00473 Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) {
00474 Advance();
00475 if (c0_ == next) {
00476 Advance();
00477 return then;
00478 } else {
00479 return else_;
00480 }
00481 }
00482
00483
00484 Token::Value Scanner::ScanToken() {
00485 switch (c0_) {
00486
00487 case '"': case '\'':
00488 return ScanString();
00489
00490 case '<':
00491
00492 Advance();
00493 if (c0_ == '=') return Select(Token::LTE);
00494 if (c0_ == '<') return Select('=', Token::ASSIGN_SHL, Token::SHL);
00495 if (c0_ == '!') return ScanHtmlComment();
00496 return Token::LT;
00497
00498 case '>':
00499
00500 Advance();
00501 if (c0_ == '=') return Select(Token::GTE);
00502 if (c0_ == '>') {
00503
00504 Advance();
00505 if (c0_ == '=') return Select(Token::ASSIGN_SAR);
00506 if (c0_ == '>') return Select('=', Token::ASSIGN_SHR, Token::SHR);
00507 return Token::SAR;
00508 }
00509 return Token::GT;
00510
00511 case '=':
00512
00513 Advance();
00514 if (c0_ == '=') return Select('=', Token::EQ_STRICT, Token::EQ);
00515 return Token::ASSIGN;
00516
00517 case '!':
00518
00519 Advance();
00520 if (c0_ == '=') return Select('=', Token::NE_STRICT, Token::NE);
00521 return Token::NOT;
00522
00523 case '+':
00524
00525 Advance();
00526 if (c0_ == '+') return Select(Token::INC);
00527 if (c0_ == '=') return Select(Token::ASSIGN_ADD);
00528 return Token::ADD;
00529
00530 case '-':
00531
00532 Advance();
00533 if (c0_ == '-') return Select(Token::DEC);
00534 if (c0_ == '=') return Select(Token::ASSIGN_SUB);
00535 return Token::SUB;
00536
00537 case '*':
00538
00539 return Select('=', Token::ASSIGN_MUL, Token::MUL);
00540
00541 case '%':
00542
00543 return Select('=', Token::ASSIGN_MOD, Token::MOD);
00544
00545 case '/':
00546
00547 Advance();
00548 if (c0_ == '/') return SkipSingleLineComment();
00549 if (c0_ == '*') return SkipMultiLineComment();
00550 if (c0_ == '=') return Select(Token::ASSIGN_DIV);
00551 return Token::DIV;
00552
00553 case '&':
00554
00555 Advance();
00556 if (c0_ == '&') return Select(Token::AND);
00557 if (c0_ == '=') return Select(Token::ASSIGN_BIT_AND);
00558 return Token::BIT_AND;
00559
00560 case '|':
00561
00562 Advance();
00563 if (c0_ == '|') return Select(Token::OR);
00564 if (c0_ == '=') return Select(Token::ASSIGN_BIT_OR);
00565 return Token::BIT_OR;
00566
00567 case '^':
00568
00569 return Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
00570
00571 case '.':
00572
00573 Advance();
00574 if (IsDecimalDigit(c0_)) return ScanNumber(true);
00575 return Token::PERIOD;
00576
00577 case ':':
00578 return Select(Token::COLON);
00579
00580 case ';':
00581 return Select(Token::SEMICOLON);
00582
00583 case ',':
00584 return Select(Token::COMMA);
00585
00586 case '(':
00587 return Select(Token::LPAREN);
00588
00589 case ')':
00590 return Select(Token::RPAREN);
00591
00592 case '[':
00593 return Select(Token::LBRACK);
00594
00595 case ']':
00596 return Select(Token::RBRACK);
00597
00598 case '{':
00599 return Select(Token::LBRACE);
00600
00601 case '}':
00602 return Select(Token::RBRACE);
00603
00604 case '?':
00605 return Select(Token::CONDITIONAL);
00606
00607 case '~':
00608 return Select(Token::BIT_NOT);
00609
00610 default:
00611 if (kIsIdentifierStart.get(c0_))
00612 return ScanIdentifier();
00613 if (IsDecimalDigit(c0_))
00614 return ScanNumber(false);
00615 if (c0_ < 0)
00616 return Token::EOS;
00617 return Select(Token::ILLEGAL);
00618 }
00619
00620 UNREACHABLE();
00621 return Token::ILLEGAL;
00622 }
00623
00624
00625
00626 void Scanner::ScanDecimalDigits() {
00627 while (IsDecimalDigit(c0_))
00628 AddCharAdvance();
00629 }
00630
00631
00632 Token::Value Scanner::ScanNumber(bool seen_period) {
00633 ASSERT(IsDecimalDigit(c0_));
00634
00635 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
00636
00637 StartLiteral();
00638 if (seen_period) {
00639
00640 AddChar('.');
00641 ScanDecimalDigits();
00642
00643 } else {
00644
00645 if (c0_ == '0') {
00646 AddCharAdvance();
00647
00648
00649 if (c0_ == 'x' || c0_ == 'X') {
00650
00651 kind = HEX;
00652 AddCharAdvance();
00653 if (!IsHexDigit(c0_))
00654
00655 return Token::ILLEGAL;
00656 while (IsHexDigit(c0_))
00657 AddCharAdvance();
00658
00659 } else if ('0' <= c0_ && c0_ <= '7') {
00660
00661 kind = OCTAL;
00662 while (true) {
00663 if (c0_ == '8' || c0_ == '9') {
00664 kind = DECIMAL;
00665 break;
00666 }
00667 if (c0_ < '0' || '7' < c0_) break;
00668 AddCharAdvance();
00669 }
00670 }
00671 }
00672
00673
00674 if (kind == DECIMAL) {
00675 ScanDecimalDigits();
00676 if (c0_ == '.') {
00677 AddCharAdvance();
00678 ScanDecimalDigits();
00679 }
00680 }
00681 }
00682
00683
00684 if (c0_ == 'e' || c0_ == 'E') {
00685 ASSERT(kind != HEX);
00686 if (kind == OCTAL) return Token::ILLEGAL;
00687
00688 AddCharAdvance();
00689 if (c0_ == '+' || c0_ == '-')
00690 AddCharAdvance();
00691 if (!IsDecimalDigit(c0_))
00692
00693 return Token::ILLEGAL;
00694 ScanDecimalDigits();
00695 }
00696 TerminateLiteral();
00697
00698
00699
00700
00701
00702 if (IsDecimalDigit(c0_) || kIsIdentifierStart.get(c0_))
00703 return Token::ILLEGAL;
00704
00705 return Token::NUMBER;
00706 }
00707
00708
00709 uc32 Scanner::ScanIdentifierUnicodeEscape() {
00710 Advance();
00711 if (c0_ != 'u') return unibrow::Utf8::kBadChar;
00712 Advance();
00713 uc32 c = ScanHexEscape('u', 4);
00714
00715
00716 if (c == '\\') return unibrow::Utf8::kBadChar;
00717 return c;
00718 }
00719
00720
00721 Token::Value Scanner::ScanIdentifier() {
00722 ASSERT(kIsIdentifierStart.get(c0_));
00723
00724 bool has_escapes = false;
00725
00726 StartLiteral();
00727
00728 if (c0_ == '\\') {
00729 has_escapes = true;
00730 uc32 c = ScanIdentifierUnicodeEscape();
00731
00732 if (!kIsIdentifierStart.get(c)) return Token::ILLEGAL;
00733 AddChar(c);
00734 } else {
00735 AddCharAdvance();
00736 }
00737
00738 while (kIsIdentifierPart.get(c0_)) {
00739 if (c0_ == '\\') {
00740 has_escapes = true;
00741 uc32 c = ScanIdentifierUnicodeEscape();
00742
00743 if (!kIsIdentifierPart.get(c)) return Token::ILLEGAL;
00744 AddChar(c);
00745 } else {
00746 AddCharAdvance();
00747 }
00748 }
00749 TerminateLiteral();
00750
00751
00752 if ((next_.literal_end - next_.literal_pos) == 1)
00753 return Token::IDENTIFIER;
00754
00755
00756
00757 if (has_escapes)
00758 return Token::IDENTIFIER;
00759
00760 return Token::Lookup(&literals_.data()[next_.literal_pos]);
00761 }
00762
00763
00764
00765 bool Scanner::IsIdentifier(unibrow::CharacterStream* buffer) {
00766
00767 if (!buffer->has_more()) return false;
00768 if (!kIsIdentifierStart.get(buffer->GetNext())) return false;
00769 while (buffer->has_more()) {
00770 if (!kIsIdentifierPart.get(buffer->GetNext())) return false;
00771 }
00772 return true;
00773 }
00774
00775
00776 bool Scanner::ScanRegExpPattern(bool seen_equal) {
00777
00778 bool in_character_class = false;
00779
00780
00781
00782 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
00783 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
00784
00785
00786
00787
00788 StartLiteral();
00789 if (seen_equal)
00790 AddChar('=');
00791
00792 while (c0_ != '/' || in_character_class) {
00793 if (kIsLineTerminator.get(c0_) || c0_ < 0)
00794 return false;
00795 if (c0_ == '\\') {
00796 AddCharAdvance();
00797 if (kIsLineTerminator.get(c0_) || c0_ < 0)
00798 return false;
00799 AddCharAdvance();
00800 } else {
00801 if (c0_ == '[')
00802 in_character_class = true;
00803 if (c0_ == ']')
00804 in_character_class = false;
00805 AddCharAdvance();
00806 }
00807 }
00808 Advance();
00809
00810 TerminateLiteral();
00811
00812 return true;
00813 }
00814
00815 bool Scanner::ScanRegExpFlags() {
00816
00817 StartLiteral();
00818 while (kIsIdentifierPart.get(c0_))
00819 AddCharAdvance();
00820 TerminateLiteral();
00821
00822 next_.location.end_pos = source_pos() - 1;
00823 return true;
00824 }
00825
00826 } }