V8: src/scanner.cc ソースファイル

説明を見る。
00001 // Copyright 2006-2008 the V8 project authors. All rights reserved.
00002 // Redistribution and use in source and binary forms, with or without
00003 // modification, are permitted provided that the following conditions are
00004 // met:
00005 //
00006 //     * Redistributions of source code must retain the above copyright
00007 //       notice, this list of conditions and the following disclaimer.
00008 //     * Redistributions in binary form must reproduce the above
00009 //       copyright notice, this list of conditions and the following
00010 //       disclaimer in the documentation and/or other materials provided
00011 //       with the distribution.
00012 //     * Neither the name of Google Inc. nor the names of its
00013 //       contributors may be used to endorse or promote products derived
00014 //       from this software without specific prior written permission.
00015 //
00016 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00017 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00018 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00019 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
00020 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00021 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
00022 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00023 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00024 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00025 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
00026 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00027 
00028 #include "v8.h"
00029 
00030 #include "ast.h"
00031 #include "scanner.h"
00032 
00033 namespace v8 { namespace internal {
00034 
00035 // ----------------------------------------------------------------------------
00036 // Character predicates
00037 
00038 
00039 unibrow::Predicate<IdentifierStart, 128> Scanner::kIsIdentifierStart;
00040 unibrow::Predicate<IdentifierPart, 128> Scanner::kIsIdentifierPart;
00041 unibrow::Predicate<unibrow::LineTerminator, 128> Scanner::kIsLineTerminator;
00042 unibrow::Predicate<unibrow::WhiteSpace, 128> Scanner::kIsWhiteSpace;
00043 
00044 
00045 StaticResource<Scanner::Utf8Decoder> Scanner::utf8_decoder_;
00046 
00047 
00048 // ----------------------------------------------------------------------------
00049 // UTF8Buffer
00050 
00051 UTF8Buffer::UTF8Buffer() : data_(NULL) {
00052   Initialize(NULL, 0);
00053 }
00054 
00055 
00056 UTF8Buffer::~UTF8Buffer() {
00057   DeleteArray(data_);
00058 }
00059 
00060 
00061 void UTF8Buffer::Initialize(char* src, int length) {
00062   DeleteArray(data_);
00063   data_ = src;
00064   size_ = length;
00065   Reset();
00066 }
00067 
00068 
00069 void UTF8Buffer::AddChar(uc32 c) {
00070   const int min_size = 1024;
00071   if (pos_ + static_cast<int>(unibrow::Utf8::kMaxEncodedSize) > size_) {
00072     int new_size = size_ * 2;
00073     if (new_size < min_size) {
00074       new_size = min_size;
00075     }
00076     char* new_data = NewArray<char>(new_size);
00077     memcpy(new_data, data_, pos_);
00078     DeleteArray(data_);
00079     data_ = new_data;
00080     size_ = new_size;
00081   }
00082   if (static_cast<unsigned>(c) < unibrow::Utf8::kMaxOneByteChar) {
00083     data_[pos_++] = c;  // common case: 7bit ASCII
00084   } else {
00085     pos_ += unibrow::Utf8::Encode(&data_[pos_], c);
00086   }
00087   ASSERT(pos_ <= size_);
00088 }
00089 
00090 
00091 // ----------------------------------------------------------------------------
00092 // UTF16Buffer
00093 
00094 
00095 UTF16Buffer::UTF16Buffer()
00096   : pos_(0),
00097     pushback_buffer_(0),
00098     last_(0),
00099     stream_(NULL) { }
00100 
00101 
00102 void UTF16Buffer::Initialize(Handle<String> data,
00103                              unibrow::CharacterStream* input) {
00104   data_ = data;
00105   pos_ = 0;
00106   stream_ = input;
00107 }
00108 
00109 
00110 Handle<String> UTF16Buffer::SubString(int start, int end) {
00111   return internal::SubString(data_, start, end);
00112 }
00113 
00114 
00115 void UTF16Buffer::PushBack(uc32 ch) {
00116   pushback_buffer()->Add(last_);
00117   last_ = ch;
00118   pos_--;
00119 }
00120 
00121 
00122 uc32 UTF16Buffer::Advance() {
00123   // NOTE: It is of importance to Persian / Farsi resources that we do
00124   // *not* strip format control characters in the scanner; see
00125   //
00126   //    https://bugzilla.mozilla.org/show_bug.cgi?id=274152
00127   //
00128   // So, even though ECMA-262, section 7.1, page 11, dictates that we
00129   // must remove Unicode format-control characters, we do not. This is
00130   // in line with how IE and SpiderMonkey handles it.
00131   if (!pushback_buffer()->is_empty()) {
00132     pos_++;
00133     return last_ = pushback_buffer()->RemoveLast();
00134   } else if (stream_->has_more()) {
00135     pos_++;
00136     uc32 next = stream_->GetNext();
00137     return last_ = next;
00138   } else {
00139     // note: currently the following increment is necessary to avoid a
00140     // test-parser problem!
00141     pos_++;
00142     return last_ = static_cast<uc32>(-1);
00143   }
00144 }
00145 
00146 
00147 void UTF16Buffer::SeekForward(int pos) {
00148   pos_ = pos;
00149   ASSERT(pushback_buffer()->is_empty());
00150   stream_->Seek(pos);
00151 }
00152 
00153 
00154 // ----------------------------------------------------------------------------
00155 // Scanner
00156 
00157 Scanner::Scanner(bool pre) : stack_overflow_(false), is_pre_parsing_(pre) {
00158   Token::Initialize();
00159 }
00160 
00161 
00162 void Scanner::Init(Handle<String> source, unibrow::CharacterStream* stream,
00163     int position) {
00164   // Initialize the source buffer.
00165   source_.Initialize(source, stream);
00166   position_ = position;
00167 
00168   // Reset literals buffer
00169   literals_.Reset();
00170 
00171   // Set c0_ (one character ahead)
00172   ASSERT(kCharacterLookaheadBufferSize == 1);
00173   Advance();
00174 
00175   // Skip initial whitespace (allowing HTML comment ends) and scan
00176   // first token.
00177   SkipWhiteSpace(true);
00178   Scan();
00179 }
00180 
00181 
00182 Handle<String> Scanner::SubString(int start, int end) {
00183   return source_.SubString(start - position_, end - position_);
00184 }
00185 
00186 
00187 Token::Value Scanner::Next() {
00188   // BUG 1215673: Find a thread safe way to set a stack limit in
00189   // pre-parse mode. Otherwise, we cannot safely pre-parse from other
00190   // threads.
00191   current_ = next_;
00192   // Check for stack-overflow before returning any tokens.
00193   StackLimitCheck check;
00194   if (check.HasOverflowed()) {
00195     stack_overflow_ = true;
00196     next_.token = Token::ILLEGAL;
00197   } else {
00198     Scan();
00199   }
00200   return current_.token;
00201 }
00202 
00203 
00204 void Scanner::StartLiteral() {
00205   next_.literal_pos = literals_.pos();
00206 }
00207 
00208 
00209 void Scanner::AddChar(uc32 c) {
00210   literals_.AddChar(c);
00211 }
00212 
00213 
00214 void Scanner::TerminateLiteral() {
00215   next_.literal_end = literals_.pos();
00216   AddChar(0);
00217 }
00218 
00219 
00220 void Scanner::AddCharAdvance() {
00221   AddChar(c0_);
00222   Advance();
00223 }
00224 
00225 
00226 void Scanner::Advance() {
00227   c0_ = source_.Advance();
00228 }
00229 
00230 
00231 void Scanner::PushBack(uc32 ch) {
00232   source_.PushBack(ch);
00233   c0_ = ch;
00234 }
00235 
00236 
00237 void Scanner::SkipWhiteSpace(bool initial) {
00238   has_line_terminator_before_next_ = initial;
00239 
00240   while (true) {
00241     while (kIsWhiteSpace.get(c0_)) {
00242       // IsWhiteSpace() includes line terminators!
00243       if (kIsLineTerminator.get(c0_))
00244         // Ignore line terminators, but remember them. This is necessary
00245         // for automatic semicolon insertion.
00246         has_line_terminator_before_next_ = true;
00247       Advance();
00248     }
00249 
00250     // If there is an HTML comment end '-->' at the beginning of a
00251     // line (with only whitespace in front of it), we treat the rest
00252     // of the line as a comment. This is in line with the way
00253     // SpiderMonkey handles it.
00254     if (c0_ == '-' && has_line_terminator_before_next_) {
00255       Advance();
00256       if (c0_ == '-') {
00257         Advance();
00258         if (c0_ == '>') {
00259           // Treat the rest of the line as a comment.
00260           SkipSingleLineComment();
00261           // Continue skipping white space after the comment.
00262           continue;
00263         }
00264         PushBack('-');  // undo Advance()
00265       }
00266       PushBack('-');  // undo Advance()
00267     }
00268     return;
00269   }
00270 }
00271 
00272 
00273 Token::Value Scanner::SkipSingleLineComment() {
00274   Advance();
00275 
00276   // The line terminator at the end of the line is not considered
00277   // to be part of the single-line comment; it is recognized
00278   // separately by the lexical grammar and becomes part of the
00279   // stream of input elements for the syntactic grammar (see
00280   // ECMA-262, section 7.4, page 12).
00281   while (c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
00282     Advance();
00283   }
00284 
00285   return Token::COMMENT;
00286 }
00287 
00288 
00289 Token::Value Scanner::SkipMultiLineComment() {
00290   ASSERT(c0_ == '*');
00291   Advance();
00292 
00293   while (c0_ >= 0) {
00294     char ch = c0_;
00295     Advance();
00296     // If we have reached the end of the multi-line comment, we
00297     // consume the '/' and insert a whitespace. This way all
00298     // multi-line comments are treated as whitespace - even the ones
00299     // containing line terminators. This contradicts ECMA-262, section
00300     // 7.4, page 12, that says that multi-line comments containing
00301     // line terminators should be treated as a line terminator, but it
00302     // matches the behaviour of SpiderMonkey and KJS.
00303     if (ch == '*' && c0_ == '/') {
00304       c0_ = ' ';
00305       return Token::COMMENT;
00306     }
00307   }
00308 
00309   // Unterminated multi-line comment.
00310   return Token::ILLEGAL;
00311 }
00312 
00313 
00314 Token::Value Scanner::ScanHtmlComment() {
00315   // Check for <!-- comments.
00316   ASSERT(c0_ == '!');
00317   Advance();
00318   if (c0_ == '-') {
00319     Advance();
00320     if (c0_ == '-') return SkipSingleLineComment();
00321     PushBack('-');  // undo Advance()
00322   }
00323   PushBack('!');  // undo Advance()
00324   ASSERT(c0_ == '!');
00325   return Token::LT;
00326 }
00327 
00328 
00329 void Scanner::Scan() {
00330   Token::Value token;
00331   bool has_line_terminator = false;
00332   do {
00333     SkipWhiteSpace(has_line_terminator);
00334 
00335     // Remember the line terminator in previous loop
00336     has_line_terminator = has_line_terminator_before_next();
00337 
00338     // Remember the position of the next token
00339     next_.location.beg_pos = source_pos();
00340 
00341     token = ScanToken();
00342   } while (token == Token::COMMENT);
00343 
00344   next_.location.end_pos = source_pos();
00345   next_.token = token;
00346 }
00347 
00348 
00349 void Scanner::SeekForward(int pos) {
00350   source_.SeekForward(pos - 1);
00351   Advance();
00352   Scan();
00353 }
00354 
00355 
00356 uc32 Scanner::ScanHexEscape(uc32 c, int length) {
00357   ASSERT(length <= 4);  // prevent overflow
00358 
00359   uc32 digits[4];
00360   uc32 x = 0;
00361   for (int i = 0; i < length; i++) {
00362     digits[i] = c0_;
00363     int d = HexValue(c0_);
00364     if (d < 0) {
00365       // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
00366       // should be illegal, but other JS VMs just return the
00367       // non-escaped version of the original character.
00368 
00369       // Push back digits read, except the last one (in c0_).
00370       for (int j = i-1; j >= 0; j--) {
00371         PushBack(digits[j]);
00372       }
00373 
00374       return c;
00375     }
00376     x = x * 16 + d;
00377     Advance();
00378   }
00379 
00380   return x;
00381 }
00382 
00383 
00384 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
00385 // ECMA-262. Other JS VMs support them.
00386 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
00387   uc32 x = c - '0';
00388   for (int i = 0; i < length; i++) {
00389     int d = c0_ - '0';
00390     if (d < 0 || d > 7) break;
00391     int nx = x * 8 + d;
00392     if (nx >= 256) break;
00393     x = nx;
00394     Advance();
00395   }
00396   return x;
00397 }
00398 
00399 
00400 void Scanner::ScanEscape() {
00401   uc32 c = c0_;
00402   Advance();
00403 
00404   // Skip escaped newlines.
00405   if (kIsLineTerminator.get(c)) {
00406     // Allow CR+LF newlines in multiline string literals.
00407     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
00408     // Allow LF+CR newlines in multiline string literals.
00409     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
00410     return;
00411   }
00412 
00413   switch (c) {
00414     case '\'':  // fall through
00415     case '"' :  // fall through
00416     case '\\': break;
00417     case 'b' : c = '\b'; break;
00418     case 'f' : c = '\f'; break;
00419     case 'n' : c = '\n'; break;
00420     case 'r' : c = '\r'; break;
00421     case 't' : c = '\t'; break;
00422     case 'u' : c = ScanHexEscape(c, 4); break;
00423     case 'v' : c = '\v'; break;
00424     case 'x' : c = ScanHexEscape(c, 2); break;
00425     case '0' :  // fall through
00426     case '1' :  // fall through
00427     case '2' :  // fall through
00428     case '3' :  // fall through
00429     case '4' :  // fall through
00430     case '5' :  // fall through
00431     case '6' :  // fall through
00432     case '7' : c = ScanOctalEscape(c, 2); break;
00433   }
00434 
00435   // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
00436   // should be illegal, but they are commonly handled
00437   // as non-escaped characters by JS VMs.
00438   AddChar(c);
00439 }
00440 
00441 
00442 Token::Value Scanner::ScanString() {
00443   uc32 quote = c0_;
00444   Advance();  // consume quote
00445 
00446   StartLiteral();
00447   while (c0_ != quote && c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
00448     uc32 c = c0_;
00449     Advance();
00450     if (c == '\\') {
00451       if (c0_ < 0) return Token::ILLEGAL;
00452       ScanEscape();
00453     } else {
00454       AddChar(c);
00455     }
00456   }
00457   if (c0_ != quote) {
00458     return Token::ILLEGAL;
00459   }
00460   TerminateLiteral();
00461 
00462   Advance();  // consume quote
00463   return Token::STRING;
00464 }
00465 
00466 
00467 Token::Value Scanner::Select(Token::Value tok) {
00468   Advance();
00469   return tok;
00470 }
00471 
00472 
00473 Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) {
00474   Advance();
00475   if (c0_ == next) {
00476     Advance();
00477     return then;
00478   } else {
00479     return else_;
00480   }
00481 }
00482 
00483 
00484 Token::Value Scanner::ScanToken() {
00485   switch (c0_) {
00486     // strings
00487     case '"': case '\'':
00488       return ScanString();
00489 
00490     case '<':
00491       // < <= << <<= <!--
00492       Advance();
00493       if (c0_ == '=') return Select(Token::LTE);
00494       if (c0_ == '<') return Select('=', Token::ASSIGN_SHL, Token::SHL);
00495       if (c0_ == '!') return ScanHtmlComment();
00496       return Token::LT;
00497 
00498     case '>':
00499       // > >= >> >>= >>> >>>=
00500       Advance();
00501       if (c0_ == '=') return Select(Token::GTE);
00502       if (c0_ == '>') {
00503         // >> >>= >>> >>>=
00504         Advance();
00505         if (c0_ == '=') return Select(Token::ASSIGN_SAR);
00506         if (c0_ == '>') return Select('=', Token::ASSIGN_SHR, Token::SHR);
00507         return Token::SAR;
00508       }
00509       return Token::GT;
00510 
00511     case '=':
00512       // = == ===
00513       Advance();
00514       if (c0_ == '=') return Select('=', Token::EQ_STRICT, Token::EQ);
00515       return Token::ASSIGN;
00516 
00517     case '!':
00518       // ! != !==
00519       Advance();
00520       if (c0_ == '=') return Select('=', Token::NE_STRICT, Token::NE);
00521       return Token::NOT;
00522 
00523     case '+':
00524       // + ++ +=
00525       Advance();
00526       if (c0_ == '+') return Select(Token::INC);
00527       if (c0_ == '=') return Select(Token::ASSIGN_ADD);
00528       return Token::ADD;
00529 
00530     case '-':
00531       // - -- -=
00532       Advance();
00533       if (c0_ == '-') return Select(Token::DEC);
00534       if (c0_ == '=') return Select(Token::ASSIGN_SUB);
00535       return Token::SUB;
00536 
00537     case '*':
00538       // * *=
00539       return Select('=', Token::ASSIGN_MUL, Token::MUL);
00540 
00541     case '%':
00542       // % %=
00543       return Select('=', Token::ASSIGN_MOD, Token::MOD);
00544 
00545     case '/':
00546       // /  // /* /=
00547       Advance();
00548       if (c0_ == '/') return SkipSingleLineComment();
00549       if (c0_ == '*') return SkipMultiLineComment();
00550       if (c0_ == '=') return Select(Token::ASSIGN_DIV);
00551       return Token::DIV;
00552 
00553     case '&':
00554       // & && &=
00555       Advance();
00556       if (c0_ == '&') return Select(Token::AND);
00557       if (c0_ == '=') return Select(Token::ASSIGN_BIT_AND);
00558       return Token::BIT_AND;
00559 
00560     case '|':
00561       // | || |=
00562       Advance();
00563       if (c0_ == '|') return Select(Token::OR);
00564       if (c0_ == '=') return Select(Token::ASSIGN_BIT_OR);
00565       return Token::BIT_OR;
00566 
00567     case '^':
00568       // ^ ^=
00569       return Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
00570 
00571     case '.':
00572       // . Number
00573       Advance();
00574       if (IsDecimalDigit(c0_)) return ScanNumber(true);
00575       return Token::PERIOD;
00576 
00577     case ':':
00578       return Select(Token::COLON);
00579 
00580     case ';':
00581       return Select(Token::SEMICOLON);
00582 
00583     case ',':
00584       return Select(Token::COMMA);
00585 
00586     case '(':
00587       return Select(Token::LPAREN);
00588 
00589     case ')':
00590       return Select(Token::RPAREN);
00591 
00592     case '[':
00593       return Select(Token::LBRACK);
00594 
00595     case ']':
00596       return Select(Token::RBRACK);
00597 
00598     case '{':
00599       return Select(Token::LBRACE);
00600 
00601     case '}':
00602       return Select(Token::RBRACE);
00603 
00604     case '?':
00605       return Select(Token::CONDITIONAL);
00606 
00607     case '~':
00608       return Select(Token::BIT_NOT);
00609 
00610     default:
00611       if (kIsIdentifierStart.get(c0_))
00612         return ScanIdentifier();
00613       if (IsDecimalDigit(c0_))
00614         return ScanNumber(false);
00615       if (c0_ < 0)
00616         return Token::EOS;
00617       return Select(Token::ILLEGAL);
00618   }
00619 
00620   UNREACHABLE();
00621   return Token::ILLEGAL;
00622 }
00623 
00624 
00625 // Returns true if any decimal digits were scanned, returns false otherwise.
00626 void Scanner::ScanDecimalDigits() {
00627   while (IsDecimalDigit(c0_))
00628     AddCharAdvance();
00629 }
00630 
00631 
00632 Token::Value Scanner::ScanNumber(bool seen_period) {
00633   ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
00634 
00635   enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
00636 
00637   StartLiteral();
00638   if (seen_period) {
00639     // we have already seen a decimal point of the float
00640     AddChar('.');
00641     ScanDecimalDigits();  // we know we have at least one digit
00642 
00643   } else {
00644     // if the first character is '0' we must check for octals and hex
00645     if (c0_ == '0') {
00646       AddCharAdvance();
00647 
00648       // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
00649       if (c0_ == 'x' || c0_ == 'X') {
00650         // hex number
00651         kind = HEX;
00652         AddCharAdvance();
00653         if (!IsHexDigit(c0_))
00654           // we must have at least one hex digit after 'x'/'X'
00655           return Token::ILLEGAL;
00656         while (IsHexDigit(c0_))
00657           AddCharAdvance();
00658 
00659       } else if ('0' <= c0_ && c0_ <= '7') {
00660         // (possible) octal number
00661         kind = OCTAL;
00662         while (true) {
00663           if (c0_ == '8' || c0_ == '9') {
00664             kind = DECIMAL;
00665             break;
00666           }
00667           if (c0_  < '0' || '7'  < c0_) break;
00668           AddCharAdvance();
00669         }
00670       }
00671     }
00672 
00673     // Parse decimal digits and allow trailing fractional part.
00674     if (kind == DECIMAL) {
00675       ScanDecimalDigits();  // optional
00676       if (c0_ == '.') {
00677         AddCharAdvance();
00678         ScanDecimalDigits();  // optional
00679       }
00680     }
00681   }
00682 
00683   // scan exponent, if any
00684   if (c0_ == 'e' || c0_ == 'E') {
00685     ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
00686     if (kind == OCTAL) return Token::ILLEGAL;  // no exponent for octals allowed
00687     // scan exponent
00688     AddCharAdvance();
00689     if (c0_ == '+' || c0_ == '-')
00690       AddCharAdvance();
00691     if (!IsDecimalDigit(c0_))
00692       // we must have at least one decimal digit after 'e'/'E'
00693       return Token::ILLEGAL;
00694     ScanDecimalDigits();
00695   }
00696   TerminateLiteral();
00697 
00698   // The source character immediately following a numeric literal must
00699   // not be an identifier start or a decimal digit; see ECMA-262
00700   // section 7.8.3, page 17 (note that we read only one decimal digit
00701   // if the value is 0).
00702   if (IsDecimalDigit(c0_) || kIsIdentifierStart.get(c0_))
00703     return Token::ILLEGAL;
00704 
00705   return Token::NUMBER;
00706 }
00707 
00708 
00709 uc32 Scanner::ScanIdentifierUnicodeEscape() {
00710   Advance();
00711   if (c0_ != 'u') return unibrow::Utf8::kBadChar;
00712   Advance();
00713   uc32 c = ScanHexEscape('u', 4);
00714   // We do not allow a unicode escape sequence to start another
00715   // unicode escape sequence.
00716   if (c == '\\') return unibrow::Utf8::kBadChar;
00717   return c;
00718 }
00719 
00720 
00721 Token::Value Scanner::ScanIdentifier() {
00722   ASSERT(kIsIdentifierStart.get(c0_));
00723 
00724   bool has_escapes = false;
00725 
00726   StartLiteral();
00727   // Scan identifier start character.
00728   if (c0_ == '\\') {
00729     has_escapes = true;
00730     uc32 c = ScanIdentifierUnicodeEscape();
00731     // Only allow legal identifier start characters.
00732     if (!kIsIdentifierStart.get(c)) return Token::ILLEGAL;
00733     AddChar(c);
00734   } else {
00735     AddCharAdvance();
00736   }
00737   // Scan the rest of the identifier characters.
00738   while (kIsIdentifierPart.get(c0_)) {
00739     if (c0_ == '\\') {
00740       has_escapes = true;
00741       uc32 c = ScanIdentifierUnicodeEscape();
00742       // Only allow legal identifier part characters.
00743       if (!kIsIdentifierPart.get(c)) return Token::ILLEGAL;
00744       AddChar(c);
00745     } else {
00746       AddCharAdvance();
00747     }
00748   }
00749   TerminateLiteral();
00750 
00751   // We don't have any 1-letter keywords (this is probably a common case).
00752   if ((next_.literal_end - next_.literal_pos) == 1)
00753     return Token::IDENTIFIER;
00754 
00755   // If the identifier contains unicode escapes, it must not be
00756   // resolved to a keyword.
00757   if (has_escapes)
00758     return Token::IDENTIFIER;
00759 
00760   return Token::Lookup(&literals_.data()[next_.literal_pos]);
00761 }
00762 
00763 
00764 
00765 bool Scanner::IsIdentifier(unibrow::CharacterStream* buffer) {
00766   // Checks whether the buffer contains an identifier (no escapse).
00767   if (!buffer->has_more()) return false;
00768   if (!kIsIdentifierStart.get(buffer->GetNext())) return false;
00769   while (buffer->has_more()) {
00770     if (!kIsIdentifierPart.get(buffer->GetNext())) return false;
00771   }
00772   return true;
00773 }
00774 
00775 
00776 bool Scanner::ScanRegExpPattern(bool seen_equal) {
00777   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
00778   bool in_character_class = false;
00779 
00780   // Previous token is either '/' or '/=', in the second case, the
00781   // pattern starts at =.
00782   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
00783   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
00784 
00785   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
00786   // the scanner should pass uninterpreted bodies to the RegExp
00787   // constructor.
00788   StartLiteral();
00789   if (seen_equal)
00790     AddChar('=');
00791 
00792   while (c0_ != '/' || in_character_class) {
00793     if (kIsLineTerminator.get(c0_) || c0_ < 0)
00794       return false;
00795     if (c0_ == '\\') {  // escaped character
00796       AddCharAdvance();
00797       if (kIsLineTerminator.get(c0_) || c0_ < 0)
00798         return false;
00799       AddCharAdvance();
00800     } else {  // unescaped character
00801       if (c0_ == '[')
00802         in_character_class = true;
00803       if (c0_ == ']')
00804         in_character_class = false;
00805       AddCharAdvance();
00806     }
00807   }
00808   Advance();  // consume '/'
00809 
00810   TerminateLiteral();
00811 
00812   return true;
00813 }
00814 
00815 bool Scanner::ScanRegExpFlags() {
00816   // Scan regular expression flags.
00817   StartLiteral();
00818   while (kIsIdentifierPart.get(c0_))
00819     AddCharAdvance();
00820   TerminateLiteral();
00821 
00822   next_.location.end_pos = source_pos() - 1;
00823   return true;
00824 }
00825 
00826 } }  // namespace v8::internal