00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044 #include "config.h"
00045
00046 #include "pcre_internal.h"
00047
00048 #include <string.h>
00049 #include "ASCIICType.h"
00050
00051
00052
00053 #define REQ_UNSET (-2)
00054 #define REQ_NONE (-1)
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066 #define BRASTACK_SIZE 200
00067
00068
00069
00070
00071
00072
00073 static const short escapes[] = {
00074 0, 0, 0, 0, 0, 0, 0, 0,
00075 0, 0, ':', ';', '<', '=', '>', '?',
00076 '@', 0, -ESC_B, 0, -ESC_D, 0, 0, 0,
00077 0, 0, 0, 0, 0, 0, 0, 0,
00078 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W,
00079 0, 0, 0, '[', '\\', ']', '^', '_',
00080 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0,
00081 0, 0, 0, 0, 0, 0, '\n', 0,
00082 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w,
00083 0, 0, 0
00084 };
00085
00086
00087
00088
00089 enum ErrorCode {
00090 ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
00091 ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17
00092 };
00093
00094
00095
00096
00097 static const char* errorText(ErrorCode code)
00098 {
00099 static const char errorTexts[] =
00100
00101 "\\ at end of pattern\0"
00102 "\\c at end of pattern\0"
00103 "character value in \\x{...} sequence is too large\0"
00104 "numbers out of order in {} quantifier\0"
00105
00106 "number too big in {} quantifier\0"
00107 "missing terminating ] for character class\0"
00108 "internal error: code overflow\0"
00109 "range out of order in character class\0"
00110 "nothing to repeat\0"
00111
00112 "unmatched parentheses\0"
00113 "internal error: unexpected repeat\0"
00114 "unrecognized character after (?\0"
00115 "failed to get memory\0"
00116 "missing )\0"
00117
00118 "reference to non-existent subpattern\0"
00119 "regular expression too large\0"
00120 "parentheses nested too deeply"
00121 ;
00122
00123 int i = code;
00124 const char* text = errorTexts;
00125 while (i > 1)
00126 i -= !*text++;
00127 return text;
00128 }
00129
00130
00131
00132
00133 struct CompileData {
00134 CompileData() {
00135 top_backref = 0;
00136 backrefMap = 0;
00137 req_varyopt = 0;
00138 needOuterBracket = false;
00139 numCapturingBrackets = 0;
00140 }
00141 int top_backref;
00142 unsigned backrefMap;
00143 int req_varyopt;
00144 bool needOuterBracket;
00145 int numCapturingBrackets;
00146 };
00147
00148
00149
00150 static bool compileBracket(int, int*, unsigned char**, const UChar**, const UChar*, ErrorCode*, int, int*, int*, CompileData&);
00151 static bool bracketIsAnchored(const unsigned char* code);
00152 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap);
00153 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert);
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177 static int checkEscape(const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int bracount, bool isclass)
00178 {
00179 const UChar* ptr = *ptrptr + 1;
00180
00181
00182 if (ptr == patternEnd) {
00183 *errorcodeptr = ERR1;
00184 *ptrptr = ptr;
00185 return 0;
00186 }
00187
00188 int c = *ptr;
00189
00190
00191
00192
00193
00194 if (c < '0' || c > 'z') {
00195 } else if (int escapeValue = escapes[c - '0']) {
00196 c = escapeValue;
00197 if (isclass) {
00198 if (-c == ESC_b)
00199 c = '\b';
00200 else if (-c == ESC_B)
00201 c = 'B';
00202 }
00203
00204
00205 } else {
00206 switch (c) {
00207 case '1':
00208 case '2':
00209 case '3':
00210 case '4':
00211 case '5':
00212 case '6':
00213 case '7':
00214 case '8':
00215 case '9':
00216
00217
00218
00219
00220
00221 if (!isclass) {
00222 const UChar* oldptr = ptr;
00223 c -= '0';
00224 while ((ptr + 1 < patternEnd) && isASCIIDigit(ptr[1]) && c <= bracount)
00225 c = c * 10 + *(++ptr) - '0';
00226 if (c <= bracount) {
00227 c = -(ESC_REF + c);
00228 break;
00229 }
00230 ptr = oldptr;
00231 }
00232
00233
00234
00235
00236 if ((c = *ptr) >= '8')
00237 break;
00238
00239
00240
00241
00242 case '0': {
00243 c -= '0';
00244 int i;
00245 for (i = 1; i <= 2; ++i) {
00246 if (ptr + i >= patternEnd || ptr[i] < '0' || ptr[i] > '7')
00247 break;
00248 int cc = c * 8 + ptr[i] - '0';
00249 if (cc > 255)
00250 break;
00251 c = cc;
00252 }
00253 ptr += i - 1;
00254 break;
00255 }
00256
00257 case 'x': {
00258 c = 0;
00259 int i;
00260 for (i = 1; i <= 2; ++i) {
00261 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
00262 c = 'x';
00263 i = 1;
00264 break;
00265 }
00266 int cc = ptr[i];
00267 if (cc >= 'a')
00268 cc -= 32;
00269 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
00270 }
00271 ptr += i - 1;
00272 break;
00273 }
00274
00275 case 'u': {
00276 c = 0;
00277 int i;
00278 for (i = 1; i <= 4; ++i) {
00279 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
00280 c = 'u';
00281 i = 1;
00282 break;
00283 }
00284 int cc = ptr[i];
00285 if (cc >= 'a')
00286 cc -= 32;
00287 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
00288 }
00289 ptr += i - 1;
00290 break;
00291 }
00292
00293 case 'c':
00294 if (++ptr == patternEnd) {
00295 *errorcodeptr = ERR2;
00296 return 0;
00297 }
00298 c = *ptr;
00299
00300
00301
00302 c = toASCIIUpper(c) ^ 0x40;
00303 break;
00304 }
00305 }
00306
00307 *ptrptr = ptr;
00308 return c;
00309 }
00310
00311
00312
00313
00314
00315
00316
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326 static bool isCountedRepeat(const UChar* p, const UChar* patternEnd)
00327 {
00328 if (p >= patternEnd || !isASCIIDigit(*p))
00329 return false;
00330 p++;
00331 while (p < patternEnd && isASCIIDigit(*p))
00332 p++;
00333 if (p < patternEnd && *p == '}')
00334 return true;
00335
00336 if (p >= patternEnd || *p++ != ',')
00337 return false;
00338 if (p < patternEnd && *p == '}')
00339 return true;
00340
00341 if (p >= patternEnd || !isASCIIDigit(*p))
00342 return false;
00343 p++;
00344 while (p < patternEnd && isASCIIDigit(*p))
00345 p++;
00346
00347 return (p < patternEnd && *p == '}');
00348 }
00349
00350
00351
00352
00353
00354
00355
00356
00357
00358
00359
00360
00361
00362
00363
00364
00365
00366
00367
00368
00369 static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, ErrorCode* errorcodeptr)
00370 {
00371 int min = 0;
00372 int max = -1;
00373
00374
00375
00376
00377 while (isASCIIDigit(*p))
00378 min = min * 10 + *p++ - '0';
00379 if (min < 0 || min > 65535) {
00380 *errorcodeptr = ERR5;
00381 return p;
00382 }
00383
00384
00385
00386
00387 if (*p == '}')
00388 max = min;
00389 else {
00390 if (*(++p) != '}') {
00391 max = 0;
00392 while (isASCIIDigit(*p))
00393 max = max * 10 + *p++ - '0';
00394 if (max < 0 || max > 65535) {
00395 *errorcodeptr = ERR5;
00396 return p;
00397 }
00398 if (max < min) {
00399 *errorcodeptr = ERR4;
00400 return p;
00401 }
00402 }
00403 }
00404
00405
00406
00407
00408 *minp = min;
00409 *maxp = max;
00410 return p;
00411 }
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426 static const unsigned char* firstSignificantOpcode(const unsigned char* code)
00427 {
00428 while (*code == OP_BRANUMBER)
00429 code += 3;
00430 return code;
00431 }
00432
00433 static const unsigned char* firstSignificantOpcodeSkippingAssertions(const unsigned char* code)
00434 {
00435 while (true) {
00436 switch (*code) {
00437 case OP_ASSERT_NOT:
00438 advanceToEndOfBracket(code);
00439 code += 1 + LINK_SIZE;
00440 break;
00441 case OP_WORD_BOUNDARY:
00442 case OP_NOT_WORD_BOUNDARY:
00443 ++code;
00444 break;
00445 case OP_BRANUMBER:
00446 code += 3;
00447 break;
00448 default:
00449 return code;
00450 }
00451 }
00452 }
00453
00454
00455
00456
00457
00458
00459
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472 static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr)
00473 {
00474 int c, othercase = 0;
00475
00476 for (c = *cptr; c <= d; c++) {
00477 if ((othercase = kjs_pcre_ucp_othercase(c)) >= 0)
00478 break;
00479 }
00480
00481 if (c > d)
00482 return false;
00483
00484 *ocptr = othercase;
00485 int next = othercase + 1;
00486
00487 for (++c; c <= d; c++) {
00488 if (kjs_pcre_ucp_othercase(c) != next)
00489 break;
00490 next++;
00491 }
00492
00493 *odptr = next - 1;
00494 *cptr = c;
00495
00496 return true;
00497 }
00498
00499
00500
00501
00502
00503
00504
00505
00506
00507
00508
00509
00510
00511
00512
00513 static int encodeUTF8(int cvalue, unsigned char *buffer)
00514 {
00515 int i;
00516 for (i = 0; i < kjs_pcre_utf8_table1_size; i++)
00517 if (cvalue <= kjs_pcre_utf8_table1[i])
00518 break;
00519 buffer += i;
00520 for (int j = i; j > 0; j--) {
00521 *buffer-- = 0x80 | (cvalue & 0x3f);
00522 cvalue >>= 6;
00523 }
00524 *buffer = kjs_pcre_utf8_table2[i] | cvalue;
00525 return i + 1;
00526 }
00527
00528
00529
00530
00531
00532
00533
00534
00535
00536
00537
00538
00539
00540
00541
00542
00543
00544
00545
00546
00547
00548 static inline bool safelyCheckNextChar(const UChar* ptr, const UChar* patternEnd, UChar expected)
00549 {
00550 return ((ptr + 1 < patternEnd) && ptr[1] == expected);
00551 }
00552
00553 static bool
00554 compileBranch(int options, int* brackets, unsigned char** codeptr,
00555 const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int *firstbyteptr,
00556 int* reqbyteptr, CompileData& cd)
00557 {
00558 int repeat_type, op_type;
00559 int repeat_min = 0, repeat_max = 0;
00560 int bravalue = 0;
00561 int reqvary, tempreqvary;
00562 int c;
00563 unsigned char* code = *codeptr;
00564 unsigned char* tempcode;
00565 bool groupsetfirstbyte = false;
00566 const UChar* ptr = *ptrptr;
00567 const UChar* tempptr;
00568 unsigned char* previous = NULL;
00569 unsigned char classbits[32];
00570
00571 bool class_utf8;
00572 unsigned char* class_utf8data;
00573 unsigned char utf8_char[6];
00574
00575
00576
00577
00578
00579
00580
00581
00582
00583
00584
00585 int firstbyte = REQ_UNSET;
00586 int reqbyte = REQ_UNSET;
00587 int zeroreqbyte = REQ_UNSET;
00588 int zerofirstbyte = REQ_UNSET;
00589
00590
00591
00592
00593
00594
00595 int req_caseopt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0;
00596
00597
00598
00599 for (;; ptr++) {
00600 bool negate_class;
00601 bool should_flip_negation;
00602 int class_charcount;
00603 int class_lastchar;
00604 int skipbytes;
00605 int subreqbyte;
00606 int subfirstbyte;
00607 int mclength;
00608 unsigned char mcbuffer[8];
00609
00610
00611
00612 c = ptr < patternEnd ? *ptr : 0;
00613
00614
00615
00616
00617 bool is_quantifier = c == '*' || c == '+' || c == '?' || (c == '{' && isCountedRepeat(ptr + 1, patternEnd));
00618
00619 switch (c) {
00620
00621
00622 case 0:
00623 if (ptr < patternEnd)
00624 goto NORMAL_CHAR;
00625
00626 case '|':
00627 case ')':
00628 *firstbyteptr = firstbyte;
00629 *reqbyteptr = reqbyte;
00630 *codeptr = code;
00631 *ptrptr = ptr;
00632 return true;
00633
00634
00635
00636
00637 case '^':
00638 if (options & MatchAcrossMultipleLinesOption) {
00639 if (firstbyte == REQ_UNSET)
00640 firstbyte = REQ_NONE;
00641 *code++ = OP_BOL;
00642 } else
00643 *code++ = OP_CIRC;
00644 previous = NULL;
00645 break;
00646
00647 case '$':
00648 previous = NULL;
00649 if (options & MatchAcrossMultipleLinesOption)
00650 *code++ = OP_EOL;
00651 else
00652 *code++ = OP_DOLL;
00653 break;
00654
00655
00656
00657
00658 case '.':
00659 if (firstbyte == REQ_UNSET)
00660 firstbyte = REQ_NONE;
00661 zerofirstbyte = firstbyte;
00662 zeroreqbyte = reqbyte;
00663 previous = code;
00664 *code++ = OP_NOT_NEWLINE;
00665 break;
00666
00667
00668
00669
00670
00671
00672
00673
00674
00675
00676
00677
00678
00679 case '[': {
00680 previous = code;
00681 should_flip_negation = false;
00682
00683
00684
00685
00686
00687
00688 if (ptr + 1 >= patternEnd) {
00689 *errorcodeptr = ERR6;
00690 return false;
00691 }
00692
00693 if (ptr[1] == '^') {
00694 negate_class = true;
00695 ++ptr;
00696 } else
00697 negate_class = false;
00698
00699
00700
00701
00702
00703 class_charcount = 0;
00704 class_lastchar = -1;
00705
00706 class_utf8 = false;
00707 class_utf8data = code + LINK_SIZE + 34;
00708
00709
00710
00711
00712
00713
00714 memset(classbits, 0, 32 * sizeof(unsigned char));
00715
00716
00717
00718
00719
00720
00721 while ((++ptr < patternEnd) && (c = *ptr) != ']') {
00722
00723
00724
00725
00726
00727
00728
00729
00730 if (c == '\\') {
00731 c = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCapturingBrackets, true);
00732 if (c < 0) {
00733 class_charcount += 2;
00734 switch (-c) {
00735 case ESC_d:
00736 for (c = 0; c < 32; c++)
00737 classbits[c] |= classBitmapForChar(c + cbit_digit);
00738 continue;
00739
00740 case ESC_D:
00741 should_flip_negation = true;
00742 for (c = 0; c < 32; c++)
00743 classbits[c] |= ~classBitmapForChar(c + cbit_digit);
00744 continue;
00745
00746 case ESC_w:
00747 for (c = 0; c < 32; c++)
00748 classbits[c] |= classBitmapForChar(c + cbit_word);
00749 continue;
00750
00751 case ESC_W:
00752 should_flip_negation = true;
00753 for (c = 0; c < 32; c++)
00754 classbits[c] |= ~classBitmapForChar(c + cbit_word);
00755 continue;
00756
00757 case ESC_s:
00758 for (c = 0; c < 32; c++)
00759 classbits[c] |= classBitmapForChar(c + cbit_space);
00760 continue;
00761
00762 case ESC_S:
00763 should_flip_negation = true;
00764 for (c = 0; c < 32; c++)
00765 classbits[c] |= ~classBitmapForChar(c + cbit_space);
00766 continue;
00767
00768
00769
00770
00771
00772 default:
00773 c = *ptr;
00774 class_charcount -= 2;
00775 }
00776 }
00777
00778
00779
00780
00781 }
00782
00783
00784
00785
00786
00787 if ((ptr + 2 < patternEnd) && ptr[1] == '-' && ptr[2] != ']') {
00788 ptr += 2;
00789
00790 int d = *ptr;
00791
00792
00793
00794
00795
00796 if (d == '\\') {
00797 const UChar* oldptr = ptr;
00798 d = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCapturingBrackets, true);
00799
00800
00801 if (d < 0) {
00802 ptr = oldptr - 2;
00803 goto LONE_SINGLE_CHARACTER;
00804 }
00805 }
00806
00807
00808
00809
00810 if (d == c)
00811 goto LONE_SINGLE_CHARACTER;
00812
00813
00814
00815
00816
00817
00818 if ((d > 255 || ((options & IgnoreCaseOption) && d > 127))) {
00819 class_utf8 = true;
00820
00821
00822
00823
00824
00825 if (options & IgnoreCaseOption) {
00826 int occ, ocd;
00827 int cc = c;
00828 int origd = d;
00829 while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
00830 if (occ >= c && ocd <= d)
00831 continue;
00832
00833 if (occ < c && ocd >= c - 1)
00834 {
00835 c = occ;
00836 continue;
00837 }
00838 if (ocd > d && occ <= d + 1)
00839 {
00840 d = ocd;
00841 continue;
00842 }
00843
00844 if (occ == ocd)
00845 *class_utf8data++ = XCL_SINGLE;
00846 else {
00847 *class_utf8data++ = XCL_RANGE;
00848 class_utf8data += encodeUTF8(occ, class_utf8data);
00849 }
00850 class_utf8data += encodeUTF8(ocd, class_utf8data);
00851 }
00852 }
00853
00854
00855
00856
00857 *class_utf8data++ = XCL_RANGE;
00858 class_utf8data += encodeUTF8(c, class_utf8data);
00859 class_utf8data += encodeUTF8(d, class_utf8data);
00860
00861
00862
00863
00864
00865 continue;
00866 }
00867
00868
00869
00870
00871
00872 for (; c <= d; c++) {
00873 classbits[c/8] |= (1 << (c&7));
00874 if (options & IgnoreCaseOption) {
00875 int uc = flipCase(c);
00876 classbits[uc/8] |= (1 << (uc&7));
00877 }
00878 class_charcount++;
00879 class_lastchar = c;
00880 }
00881
00882 continue;
00883 }
00884
00885
00886
00887
00888
00889 LONE_SINGLE_CHARACTER:
00890
00891
00892
00893 if ((c > 255 || ((options & IgnoreCaseOption) && c > 127))) {
00894 class_utf8 = true;
00895 *class_utf8data++ = XCL_SINGLE;
00896 class_utf8data += encodeUTF8(c, class_utf8data);
00897
00898 if (options & IgnoreCaseOption) {
00899 int othercase;
00900 if ((othercase = kjs_pcre_ucp_othercase(c)) >= 0) {
00901 *class_utf8data++ = XCL_SINGLE;
00902 class_utf8data += encodeUTF8(othercase, class_utf8data);
00903 }
00904 }
00905 } else {
00906
00907 classbits[c/8] |= (1 << (c&7));
00908 if (options & IgnoreCaseOption) {
00909 c = flipCase(c);
00910 classbits[c/8] |= (1 << (c&7));
00911 }
00912 class_charcount++;
00913 class_lastchar = c;
00914 }
00915 }
00916
00917
00918
00919
00920
00921
00922
00923
00924
00925
00926
00927
00928
00929
00930
00931 if (class_charcount == 1 && (!class_utf8 && (!negate_class || class_lastchar < 128))) {
00932 zeroreqbyte = reqbyte;
00933
00934
00935
00936 if (negate_class) {
00937 if (firstbyte == REQ_UNSET)
00938 firstbyte = REQ_NONE;
00939 zerofirstbyte = firstbyte;
00940 *code++ = OP_NOT;
00941 *code++ = class_lastchar;
00942 break;
00943 }
00944
00945
00946
00947
00948 c = class_lastchar;
00949 goto NORMAL_CHAR;
00950 }
00951
00952
00953
00954
00955
00956
00957 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
00958 zerofirstbyte = firstbyte;
00959 zeroreqbyte = reqbyte;
00960
00961
00962
00963
00964
00965 if (class_utf8 && !should_flip_negation) {
00966 *class_utf8data++ = XCL_END;
00967 *code++ = OP_XCLASS;
00968 code += LINK_SIZE;
00969 *code = negate_class? XCL_NOT : 0;
00970
00971
00972
00973
00974 if (class_charcount > 0) {
00975 *code++ |= XCL_MAP;
00976 memcpy(code, classbits, 32);
00977 code = class_utf8data;
00978 }
00979
00980
00981
00982 else {
00983 int len = class_utf8data - (code + 33);
00984 memmove(code + 1, code + 33, len);
00985 code += len + 1;
00986 }
00987
00988
00989
00990 putLinkValue(previous + 1, code - previous);
00991 break;
00992 }
00993
00994
00995
00996
00997
00998
00999 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
01000 if (negate_class)
01001 for (c = 0; c < 32; c++)
01002 code[c] = ~classbits[c];
01003 else
01004 memcpy(code, classbits, 32);
01005 code += 32;
01006 break;
01007 }
01008
01009
01010
01011
01012 case '{':
01013 if (!is_quantifier)
01014 goto NORMAL_CHAR;
01015 ptr = readRepeatCounts(ptr + 1, &repeat_min, &repeat_max, errorcodeptr);
01016 if (*errorcodeptr)
01017 goto FAILED;
01018 goto REPEAT;
01019
01020 case '*':
01021 repeat_min = 0;
01022 repeat_max = -1;
01023 goto REPEAT;
01024
01025 case '+':
01026 repeat_min = 1;
01027 repeat_max = -1;
01028 goto REPEAT;
01029
01030 case '?':
01031 repeat_min = 0;
01032 repeat_max = 1;
01033
01034 REPEAT:
01035 if (!previous) {
01036 *errorcodeptr = ERR9;
01037 goto FAILED;
01038 }
01039
01040 if (repeat_min == 0) {
01041 firstbyte = zerofirstbyte;
01042 reqbyte = zeroreqbyte;
01043 }
01044
01045
01046
01047 reqvary = (repeat_min == repeat_max) ? 0 : REQ_VARY;
01048
01049 op_type = 0;
01050
01051
01052
01053
01054
01055 tempcode = previous;
01056
01057
01058
01059
01060
01061
01062
01063 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
01064 repeat_type = 1;
01065 ptr++;
01066 } else
01067 repeat_type = 0;
01068
01069
01070
01071
01072
01073
01074
01075 if (*previous == OP_CHAR || *previous == OP_CHAR_IGNORING_CASE) {
01076
01077
01078
01079
01080
01081 if (code[-1] & 0x80) {
01082 unsigned char *lastchar = code - 1;
01083 while((*lastchar & 0xc0) == 0x80)
01084 lastchar--;
01085 c = code - lastchar;
01086 memcpy(utf8_char, lastchar, c);
01087 c |= 0x80;
01088 }
01089 else {
01090 c = code[-1];
01091 if (repeat_min > 1)
01092 reqbyte = c | req_caseopt | cd.req_varyopt;
01093 }
01094
01095 goto OUTPUT_SINGLE_REPEAT;
01096 }
01097
01098 else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LETTER_IGNORING_CASE) {
01099 c = previous[1];
01100 if (repeat_min > 1)
01101 reqbyte = c | req_caseopt | cd.req_varyopt;
01102 goto OUTPUT_SINGLE_REPEAT;
01103 }
01104
01105
01106
01107
01108
01109
01110 else if (*previous == OP_NOT) {
01111 op_type = OP_NOTSTAR - OP_STAR;
01112 c = previous[1];
01113 goto OUTPUT_SINGLE_REPEAT;
01114 }
01115
01116
01117
01118
01119
01120 else if (*previous <= OP_NOT_NEWLINE) {
01121 op_type = OP_TYPESTAR - OP_STAR;
01122 c = *previous;
01123
01124 OUTPUT_SINGLE_REPEAT:
01125 int prop_type = -1;
01126 int prop_value = -1;
01127
01128 unsigned char* oldcode = code;
01129 code = previous;
01130
01131
01132
01133
01134 if (repeat_max == 0)
01135 goto END_REPEAT;
01136
01137
01138
01139 repeat_type += op_type;
01140
01141
01142
01143
01144 if (repeat_min == 0) {
01145 if (repeat_max == -1)
01146 *code++ = OP_STAR + repeat_type;
01147 else if (repeat_max == 1)
01148 *code++ = OP_QUERY + repeat_type;
01149 else {
01150 *code++ = OP_UPTO + repeat_type;
01151 put2ByteValueAndAdvance(code, repeat_max);
01152 }
01153 }
01154
01155
01156
01157
01158
01159
01160 else if (repeat_min == 1) {
01161 if (repeat_max == -1)
01162 *code++ = OP_PLUS + repeat_type;
01163 else {
01164 code = oldcode;
01165 if (repeat_max == 1)
01166 goto END_REPEAT;
01167 *code++ = OP_UPTO + repeat_type;
01168 put2ByteValueAndAdvance(code, repeat_max - 1);
01169 }
01170 }
01171
01172
01173
01174
01175 else {
01176 *code++ = OP_EXACT + op_type;
01177 put2ByteValueAndAdvance(code, repeat_min);
01178
01179
01180
01181
01182
01183
01184
01185 if (repeat_max < 0) {
01186 if (c >= 128) {
01187 memcpy(code, utf8_char, c & 7);
01188 code += c & 7;
01189 } else {
01190 *code++ = c;
01191 if (prop_type >= 0) {
01192 *code++ = prop_type;
01193 *code++ = prop_value;
01194 }
01195 }
01196 *code++ = OP_STAR + repeat_type;
01197 }
01198
01199
01200
01201
01202 else if (repeat_max != repeat_min) {
01203 if (c >= 128) {
01204 memcpy(code, utf8_char, c & 7);
01205 code += c & 7;
01206 } else
01207 *code++ = c;
01208 if (prop_type >= 0) {
01209 *code++ = prop_type;
01210 *code++ = prop_value;
01211 }
01212 repeat_max -= repeat_min;
01213 *code++ = OP_UPTO + repeat_type;
01214 put2ByteValueAndAdvance(code, repeat_max);
01215 }
01216 }
01217
01218
01219
01220 if (c >= 128) {
01221 memcpy(code, utf8_char, c & 7);
01222 code += c & 7;
01223 } else
01224 *code++ = c;
01225
01226
01227
01228
01229 if (prop_type >= 0) {
01230 *code++ = prop_type;
01231 *code++ = prop_value;
01232 }
01233 }
01234
01235
01236
01237
01238 else if (*previous == OP_CLASS ||
01239 *previous == OP_NCLASS ||
01240 *previous == OP_XCLASS ||
01241 *previous == OP_REF)
01242 {
01243 if (repeat_max == 0) {
01244 code = previous;
01245 goto END_REPEAT;
01246 }
01247
01248 if (repeat_min == 0 && repeat_max == -1)
01249 *code++ = OP_CRSTAR + repeat_type;
01250 else if (repeat_min == 1 && repeat_max == -1)
01251 *code++ = OP_CRPLUS + repeat_type;
01252 else if (repeat_min == 0 && repeat_max == 1)
01253 *code++ = OP_CRQUERY + repeat_type;
01254 else {
01255 *code++ = OP_CRRANGE + repeat_type;
01256 put2ByteValueAndAdvance(code, repeat_min);
01257 if (repeat_max == -1)
01258 repeat_max = 0;
01259 put2ByteValueAndAdvance(code, repeat_max);
01260 }
01261 }
01262
01263
01264
01265
01266 else if (*previous >= OP_BRA) {
01267 int ketoffset = 0;
01268 int len = code - previous;
01269 unsigned char* bralink = NULL;
01270
01271
01272
01273
01274
01275
01276
01277 if (repeat_max == -1) {
01278 const unsigned char* ket = previous;
01279 advanceToEndOfBracket(ket);
01280 ketoffset = code - ket;
01281 }
01282
01283
01284
01285
01286
01287
01288
01289
01290 if (repeat_min == 0) {
01291
01292
01293
01294 if (repeat_max == 0) {
01295 code = previous;
01296 goto END_REPEAT;
01297 }
01298
01299
01300
01301
01302
01303
01304
01305 if (repeat_max <= 1) {
01306 *code = OP_END;
01307 memmove(previous+1, previous, len);
01308 code++;
01309 *previous++ = OP_BRAZERO + repeat_type;
01310 }
01311
01312
01313
01314
01315
01316
01317
01318
01319 else {
01320 *code = OP_END;
01321 memmove(previous + 2 + LINK_SIZE, previous, len);
01322 code += 2 + LINK_SIZE;
01323 *previous++ = OP_BRAZERO + repeat_type;
01324 *previous++ = OP_BRA;
01325
01326
01327
01328
01329 int offset = (!bralink) ? 0 : previous - bralink;
01330 bralink = previous;
01331 putLinkValueAllowZeroAndAdvance(previous, offset);
01332 }
01333
01334 repeat_max--;
01335 }
01336
01337
01338
01339
01340
01341
01342 else {
01343 if (repeat_min > 1) {
01344 if (groupsetfirstbyte && reqbyte < 0)
01345 reqbyte = firstbyte;
01346 for (int i = 1; i < repeat_min; i++) {
01347 memcpy(code, previous, len);
01348 code += len;
01349 }
01350 }
01351 if (repeat_max > 0)
01352 repeat_max -= repeat_min;
01353 }
01354
01355
01356
01357
01358
01359
01360
01361 if (repeat_max >= 0) {
01362 for (int i = repeat_max - 1; i >= 0; i--) {
01363 *code++ = OP_BRAZERO + repeat_type;
01364
01365
01366
01367
01368 if (i != 0) {
01369 *code++ = OP_BRA;
01370 int offset = (!bralink) ? 0 : code - bralink;
01371 bralink = code;
01372 putLinkValueAllowZeroAndAdvance(code, offset);
01373 }
01374
01375 memcpy(code, previous, len);
01376 code += len;
01377 }
01378
01379
01380
01381
01382 while (bralink) {
01383 int offset = code - bralink + 1;
01384 unsigned char* bra = code - offset;
01385 int oldlinkoffset = getLinkValueAllowZero(bra + 1);
01386 bralink = (!oldlinkoffset) ? 0 : bralink - oldlinkoffset;
01387 *code++ = OP_KET;
01388 putLinkValueAndAdvance(code, offset);
01389 putLinkValue(bra + 1, offset);
01390 }
01391 }
01392
01393
01394
01395
01396
01397
01398 else
01399 code[-ketoffset] = OP_KETRMAX + repeat_type;
01400 }
01401
01402
01403
01404 else {
01405 *errorcodeptr = ERR11;
01406 goto FAILED;
01407 }
01408
01409
01410
01411
01412
01413 END_REPEAT:
01414 previous = NULL;
01415 cd.req_varyopt |= reqvary;
01416 break;
01417
01418
01419
01420
01421
01422
01423
01424
01425 case '(':
01426 skipbytes = 0;
01427
01428 if (*(++ptr) == '?') {
01429 switch (*(++ptr)) {
01430 case ':':
01431 bravalue = OP_BRA;
01432 ptr++;
01433 break;
01434
01435 case '=':
01436 bravalue = OP_ASSERT;
01437 ptr++;
01438 break;
01439
01440 case '!':
01441 bravalue = OP_ASSERT_NOT;
01442 ptr++;
01443 break;
01444
01445
01446
01447 default:
01448 *errorcodeptr = ERR12;
01449 goto FAILED;
01450 }
01451 }
01452
01453
01454
01455
01456
01457 else {
01458 if (++(*brackets) > EXTRACT_BASIC_MAX) {
01459 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
01460 code[1 + LINK_SIZE] = OP_BRANUMBER;
01461 put2ByteValue(code + 2 + LINK_SIZE, *brackets);
01462 skipbytes = 3;
01463 }
01464 else
01465 bravalue = OP_BRA + *brackets;
01466 }
01467
01468
01469
01470
01471
01472
01473 previous = (bravalue >= OP_BRAZERO) ? code : 0;
01474 *code = bravalue;
01475 tempcode = code;
01476 tempreqvary = cd.req_varyopt;
01477
01478 if (!compileBracket(
01479 options,
01480 brackets,
01481 &tempcode,
01482 &ptr,
01483 patternEnd,
01484 errorcodeptr,
01485 skipbytes,
01486 &subfirstbyte,
01487 &subreqbyte,
01488 cd))
01489 goto FAILED;
01490
01491
01492
01493
01494
01495
01496
01497
01498
01499
01500
01501
01502 zeroreqbyte = reqbyte;
01503 zerofirstbyte = firstbyte;
01504 groupsetfirstbyte = false;
01505
01506 if (bravalue >= OP_BRA) {
01507
01508
01509
01510
01511
01512
01513 if (firstbyte == REQ_UNSET) {
01514 if (subfirstbyte >= 0) {
01515 firstbyte = subfirstbyte;
01516 groupsetfirstbyte = true;
01517 }
01518 else
01519 firstbyte = REQ_NONE;
01520 zerofirstbyte = REQ_NONE;
01521 }
01522
01523
01524
01525
01526
01527 else if (subfirstbyte >= 0 && subreqbyte < 0)
01528 subreqbyte = subfirstbyte | tempreqvary;
01529
01530
01531
01532
01533 if (subreqbyte >= 0)
01534 reqbyte = subreqbyte;
01535 }
01536
01537
01538
01539
01540
01541
01542
01543
01544
01545 else if (bravalue == OP_ASSERT && subreqbyte >= 0)
01546 reqbyte = subreqbyte;
01547
01548
01549
01550 code = tempcode;
01551
01552
01553
01554 if (ptr >= patternEnd || *ptr != ')') {
01555 *errorcodeptr = ERR14;
01556 goto FAILED;
01557 }
01558 break;
01559
01560
01561
01562
01563
01564 case '\\':
01565 tempptr = ptr;
01566 c = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCapturingBrackets, false);
01567
01568
01569
01570
01571
01572
01573
01574
01575 if (c < 0) {
01576
01577
01578
01579 if (firstbyte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)
01580 firstbyte = REQ_NONE;
01581
01582
01583
01584 zerofirstbyte = firstbyte;
01585 zeroreqbyte = reqbyte;
01586
01587
01588
01589 if (-c >= ESC_REF) {
01590 int number = -c - ESC_REF;
01591 previous = code;
01592 *code++ = OP_REF;
01593 put2ByteValueAndAdvance(code, number);
01594 }
01595
01596
01597
01598
01599 else {
01600 previous = (-c > ESC_b && -c <= ESC_w) ? code : NULL;
01601 *code++ = -c;
01602 }
01603 continue;
01604 }
01605
01606
01607
01608
01609
01610
01611
01612 default:
01613 NORMAL_CHAR:
01614
01615 previous = code;
01616
01617 if (c < 128) {
01618 mclength = 1;
01619 mcbuffer[0] = c;
01620
01621 if ((options & IgnoreCaseOption) && (c | 0x20) >= 'a' && (c | 0x20) <= 'z') {
01622 *code++ = OP_ASCII_LETTER_IGNORING_CASE;
01623 *code++ = c | 0x20;
01624 } else {
01625 *code++ = OP_ASCII_CHAR;
01626 *code++ = c;
01627 }
01628 } else {
01629 mclength = encodeUTF8(c, mcbuffer);
01630
01631 *code++ = (options & IgnoreCaseOption) ? OP_CHAR_IGNORING_CASE : OP_CHAR;
01632 for (c = 0; c < mclength; c++)
01633 *code++ = mcbuffer[c];
01634 }
01635
01636
01637
01638
01639
01640
01641 if (firstbyte == REQ_UNSET) {
01642 zerofirstbyte = REQ_NONE;
01643 zeroreqbyte = reqbyte;
01644
01645
01646
01647
01648 if (mclength == 1 || req_caseopt == 0) {
01649 firstbyte = mcbuffer[0] | req_caseopt;
01650 if (mclength != 1)
01651 reqbyte = code[-1] | cd.req_varyopt;
01652 }
01653 else
01654 firstbyte = reqbyte = REQ_NONE;
01655 }
01656
01657
01658
01659
01660 else {
01661 zerofirstbyte = firstbyte;
01662 zeroreqbyte = reqbyte;
01663 if (mclength == 1 || req_caseopt == 0)
01664 reqbyte = code[-1] | req_caseopt | cd.req_varyopt;
01665 }
01666
01667 break;
01668 }
01669 }
01670
01671
01672
01673
01674
01675 FAILED:
01676 *ptrptr = ptr;
01677 return false;
01678 }
01679
01680
01681
01682
01683
01684
01685
01686
01687
01688
01689
01690
01691
01692
01693
01694
01695
01696
01697
01698
01699
01700
01701
01702
01703
01704
01705
01706 static bool
01707 compileBracket(int options, int* brackets, unsigned char** codeptr,
01708 const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int skipbytes,
01709 int* firstbyteptr, int* reqbyteptr, CompileData& cd)
01710 {
01711 const UChar* ptr = *ptrptr;
01712 unsigned char* code = *codeptr;
01713 unsigned char* last_branch = code;
01714 unsigned char* start_bracket = code;
01715 int firstbyte = REQ_UNSET;
01716 int reqbyte = REQ_UNSET;
01717
01718
01719
01720 putLinkValueAllowZero(code + 1, 0);
01721 code += 1 + LINK_SIZE + skipbytes;
01722
01723
01724
01725 while (true) {
01726
01727
01728 int branchfirstbyte;
01729 int branchreqbyte;
01730 if (!compileBranch(options, brackets, &code, &ptr, patternEnd, errorcodeptr,
01731 &branchfirstbyte, &branchreqbyte, cd)) {
01732 *ptrptr = ptr;
01733 return false;
01734 }
01735
01736
01737
01738
01739 if (*last_branch != OP_ALT) {
01740 firstbyte = branchfirstbyte;
01741 reqbyte = branchreqbyte;
01742 }
01743
01744
01745
01746
01747
01748
01749 else {
01750
01751
01752
01753
01754 if (firstbyte >= 0 && firstbyte != branchfirstbyte) {
01755 if (reqbyte < 0)
01756 reqbyte = firstbyte;
01757 firstbyte = REQ_NONE;
01758 }
01759
01760
01761
01762
01763 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
01764 branchreqbyte = branchfirstbyte;
01765
01766
01767
01768 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
01769 reqbyte = REQ_NONE;
01770 else
01771 reqbyte |= branchreqbyte;
01772 }
01773
01774
01775
01776
01777
01778
01779
01780
01781
01782
01783 if (ptr >= patternEnd || *ptr != '|') {
01784 int length = code - last_branch;
01785 do {
01786 int prev_length = getLinkValueAllowZero(last_branch + 1);
01787 putLinkValue(last_branch + 1, length);
01788 length = prev_length;
01789 last_branch -= length;
01790 } while (length > 0);
01791
01792
01793
01794 *code = OP_KET;
01795 putLinkValue(code + 1, code - start_bracket);
01796 code += 1 + LINK_SIZE;
01797
01798
01799
01800 *codeptr = code;
01801 *ptrptr = ptr;
01802 *firstbyteptr = firstbyte;
01803 *reqbyteptr = reqbyte;
01804 return true;
01805 }
01806
01807
01808
01809
01810
01811
01812 *code = OP_ALT;
01813 putLinkValue(code + 1, code - last_branch);
01814 last_branch = code;
01815 code += 1 + LINK_SIZE;
01816 ptr++;
01817 }
01818 ASSERT_NOT_REACHED();
01819 }
01820
01821
01822
01823
01824
01825
01826
01827
01828
01829
01830
01831
01832
01833
01834
01835
01836
01837
01838 static bool branchIsAnchored(const unsigned char* code)
01839 {
01840 const unsigned char* scode = firstSignificantOpcode(code);
01841 int op = *scode;
01842
01843
01844 if (op >= OP_BRA || op == OP_ASSERT)
01845 return bracketIsAnchored(scode);
01846
01847
01848 return op == OP_CIRC;
01849 }
01850
01851 static bool bracketIsAnchored(const unsigned char* code)
01852 {
01853 do {
01854 if (!branchIsAnchored(code + 1 + LINK_SIZE))
01855 return false;
01856 code += getLinkValue(code + 1);
01857 } while (*code == OP_ALT);
01858 return true;
01859 }
01860
01861
01862
01863
01864
01865
01866
01867
01868
01869
01870
01871
01872
01873
01874
01875
01876
01877
01878
01879
01880
01881
01882
01883 static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
01884 {
01885 const unsigned char* scode = firstSignificantOpcode(code);
01886 int op = *scode;
01887
01888
01889 if (op > OP_BRA) {
01890 int captureNum = op - OP_BRA;
01891 if (captureNum > EXTRACT_BASIC_MAX)
01892 captureNum = get2ByteValue(scode + 2 + LINK_SIZE);
01893 int bracketMask = (captureNum < 32) ? (1 << captureNum) : 1;
01894 return bracketNeedsLineStart(scode, captureMap | bracketMask, backrefMap);
01895 }
01896
01897
01898 if (op == OP_BRA || op == OP_ASSERT)
01899 return bracketNeedsLineStart(scode, captureMap, backrefMap);
01900
01901
01902
01903
01904 if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
01905 return scode[1] == OP_NOT_NEWLINE && !(captureMap & backrefMap);
01906
01907
01908 return op == OP_CIRC || op == OP_BOL;
01909 }
01910
01911 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
01912 {
01913 do {
01914 if (!branchNeedsLineStart(code + 1 + LINK_SIZE, captureMap, backrefMap))
01915 return false;
01916 code += getLinkValue(code + 1);
01917 } while (*code == OP_ALT);
01918 return true;
01919 }
01920
01921
01922
01923
01924
01925
01926
01927
01928
01929
01930
01931
01932
01933
01934
01935
01936
01937
01938
01939
01940
01941 static int branchFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
01942 {
01943 const unsigned char* scode = firstSignificantOpcodeSkippingAssertions(code);
01944 int op = *scode;
01945
01946 if (op >= OP_BRA)
01947 op = OP_BRA;
01948
01949 switch (op) {
01950 default:
01951 return -1;
01952
01953 case OP_BRA:
01954 case OP_ASSERT:
01955 return bracketFindFirstAssertedCharacter(scode, op == OP_ASSERT);
01956
01957 case OP_EXACT:
01958 scode += 2;
01959
01960
01961 case OP_CHAR:
01962 case OP_CHAR_IGNORING_CASE:
01963 case OP_ASCII_CHAR:
01964 case OP_ASCII_LETTER_IGNORING_CASE:
01965 case OP_PLUS:
01966 case OP_MINPLUS:
01967 if (!inassert)
01968 return -1;
01969 return scode[1];
01970 }
01971 }
01972
01973 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
01974 {
01975 int c = -1;
01976 do {
01977 int d = branchFindFirstAssertedCharacter(code + 1 + LINK_SIZE, inassert);
01978 if (d < 0)
01979 return -1;
01980 if (c < 0)
01981 c = d;
01982 else if (c != d)
01983 return -1;
01984 code += getLinkValue(code + 1);
01985 } while (*code == OP_ALT);
01986 return c;
01987 }
01988
01989 static inline int multiplyWithOverflowCheck(int a, int b)
01990 {
01991 if (!a || !b)
01992 return 0;
01993 if (a > MAX_PATTERN_SIZE / b)
01994 return -1;
01995 return a * b;
01996 }
01997
01998 static int calculateCompiledPatternLength(const UChar* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase,
01999 CompileData& cd, ErrorCode& errorcode)
02000 {
02001
02002
02003
02004
02005 if (patternLength > MAX_PATTERN_SIZE) {
02006 errorcode = ERR16;
02007 return -1;
02008 }
02009
02010 int length = 1 + LINK_SIZE;
02011 int branch_extra = 0;
02012 int lastitemlength = 0;
02013 unsigned brastackptr = 0;
02014 int brastack[BRASTACK_SIZE];
02015 unsigned char bralenstack[BRASTACK_SIZE];
02016 int bracount = 0;
02017
02018 const UChar* ptr = (const UChar*)(pattern - 1);
02019 const UChar* patternEnd = (const UChar*)(pattern + patternLength);
02020
02021 while (++ptr < patternEnd) {
02022 int minRepeats = 0, maxRepeats = 0;
02023 int c = *ptr;
02024
02025 switch (c) {
02026
02027
02028
02029 case '\\':
02030 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, false);
02031 if (errorcode != 0)
02032 return -1;
02033
02034 lastitemlength = 1;
02035
02036 if (c >= 0) {
02037 length += 2;
02038
02039 if (c > 127) {
02040 int i;
02041 for (i = 0; i < kjs_pcre_utf8_table1_size; i++)
02042 if (c <= kjs_pcre_utf8_table1[i]) break;
02043 length += i;
02044 lastitemlength += i;
02045 }
02046
02047 continue;
02048 }
02049
02050
02051
02052 length++;
02053
02054
02055
02056
02057
02058 if (c <= -ESC_REF) {
02059 int refnum = -c - ESC_REF;
02060 cd.backrefMap |= (refnum < 32) ? (1 << refnum) : 1;
02061 if (refnum > cd.top_backref)
02062 cd.top_backref = refnum;
02063 length += 2;
02064 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
02065 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
02066 if (errorcode)
02067 return -1;
02068 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
02069 (minRepeats == 1 && maxRepeats == -1))
02070 length++;
02071 else
02072 length += 5;
02073 if (safelyCheckNextChar(ptr, patternEnd, '?'))
02074 ptr++;
02075 }
02076 }
02077 continue;
02078
02079 case '^':
02080 case '.':
02081 case '$':
02082 length++;
02083 lastitemlength = 1;
02084 continue;
02085
02086 case '*':
02087 case '+':
02088 case '?':
02089 length++;
02090 goto POSSESSIVE;
02091
02092
02093
02094
02095 case '{':
02096 if (!isCountedRepeat(ptr + 1, patternEnd))
02097 goto NORMAL_CHAR;
02098 ptr = readRepeatCounts(ptr + 1, &minRepeats, &maxRepeats, &errorcode);
02099 if (errorcode != 0)
02100 return -1;
02101
02102
02103
02104 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
02105 (minRepeats == 1 && maxRepeats == -1))
02106 length++;
02107
02108
02109
02110 else {
02111 if (minRepeats != 1) {
02112 length -= lastitemlength;
02113 if (minRepeats > 0)
02114 length += 3 + lastitemlength;
02115 }
02116 length += lastitemlength + ((maxRepeats > 0) ? 3 : 1);
02117 }
02118
02119 if (safelyCheckNextChar(ptr, patternEnd, '?'))
02120 ptr++;
02121
02122 POSSESSIVE:
02123 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
02124 ptr++;
02125 length += 2 + 2 * LINK_SIZE;
02126 }
02127 continue;
02128
02129
02130
02131
02132
02133
02134 case '|':
02135 if (brastackptr == 0)
02136 cd.needOuterBracket = true;
02137 length += 1 + LINK_SIZE + branch_extra;
02138 continue;
02139
02140
02141
02142
02143
02144
02145
02146
02147
02148 case '[': {
02149 int class_optcount;
02150 if (*(++ptr) == '^') {
02151 class_optcount = 10;
02152 ptr++;
02153 }
02154 else
02155 class_optcount = 0;
02156
02157 bool class_utf8 = false;
02158
02159 for (; ptr < patternEnd && *ptr != ']'; ++ptr) {
02160
02161
02162 if (*ptr == '\\') {
02163 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
02164 if (errorcode != 0)
02165 return -1;
02166
02167
02168
02169 if (c >= 0)
02170 goto NON_SPECIAL_CHARACTER;
02171
02172
02173
02174
02175 else
02176 class_optcount = 10;
02177 }
02178
02179
02180
02181
02182
02183
02184
02185 else {
02186 c = *ptr;
02187
02188
02189
02190 NON_SPECIAL_CHARACTER:
02191 class_optcount++;
02192
02193 int d = -1;
02194 if (safelyCheckNextChar(ptr, patternEnd, '-')) {
02195 UChar const *hyptr = ptr++;
02196 if (safelyCheckNextChar(ptr, patternEnd, '\\')) {
02197 ptr++;
02198 d = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
02199 if (errorcode != 0)
02200 return -1;
02201 }
02202 else if ((ptr + 1 < patternEnd) && ptr[1] != ']')
02203 d = *++ptr;
02204 if (d < 0)
02205 ptr = hyptr;
02206 }
02207
02208
02209
02210
02211 if (d >= 0) {
02212 class_optcount = 10;
02213 if (d < c) {
02214 errorcode = ERR8;
02215 return -1;
02216 }
02217
02218 if ((d > 255 || (ignoreCase && d > 127))) {
02219 unsigned char buffer[6];
02220 if (!class_utf8)
02221 {
02222 class_utf8 = true;
02223 length += LINK_SIZE + 2;
02224 }
02225
02226
02227
02228
02229
02230
02231
02232 if (ignoreCase) {
02233 int occ, ocd;
02234 int cc = c;
02235 int origd = d;
02236 while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
02237 if (occ >= c && ocd <= d)
02238 continue;
02239
02240 if (occ < c && ocd >= c - 1)
02241 {
02242 c = occ;
02243 continue;
02244 }
02245 if (ocd > d && occ <= d + 1)
02246 {
02247 d = ocd;
02248 continue;
02249 }
02250
02251
02252
02253 length += 1 + encodeUTF8(occ, buffer) +
02254 ((occ == ocd) ? 0 : encodeUTF8(ocd, buffer));
02255 }
02256 }
02257
02258
02259
02260 length += 1 + encodeUTF8(c, buffer) + encodeUTF8(d, buffer);
02261 }
02262
02263 }
02264
02265
02266
02267
02268
02269
02270 else {
02271 if ((c > 255 || (ignoreCase && c > 127))) {
02272 unsigned char buffer[6];
02273 class_optcount = 10;
02274 if (!class_utf8)
02275 {
02276 class_utf8 = true;
02277 length += LINK_SIZE + 2;
02278 }
02279 length += (ignoreCase ? 2 : 1) * (1 + encodeUTF8(c, buffer));
02280 }
02281 }
02282 }
02283 }
02284
02285 if (ptr >= patternEnd) {
02286 errorcode = ERR6;
02287 return -1;
02288 }
02289
02290
02291
02292
02293
02294
02295 if (class_optcount == 1)
02296 goto NORMAL_CHAR;
02297
02298
02299 {
02300 length += 33;
02301
02302
02303
02304
02305 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
02306 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
02307 if (errorcode != 0)
02308 return -1;
02309 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
02310 (minRepeats == 1 && maxRepeats == -1))
02311 length++;
02312 else
02313 length += 5;
02314 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
02315 ptr++;
02316 length += 2 + 2 * LINK_SIZE;
02317 } else if (safelyCheckNextChar(ptr, patternEnd, '?'))
02318 ptr++;
02319 }
02320 }
02321 continue;
02322 }
02323
02324
02325
02326 case '(': {
02327 int branch_newextra = 0;
02328 int bracket_length = 1 + LINK_SIZE;
02329 bool capturing = false;
02330
02331
02332
02333 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
02334 switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0)) {
02335
02336
02337
02338
02339
02340 case ':':
02341 case '=':
02342 case '!':
02343 ptr += 2;
02344 break;
02345
02346
02347
02348
02349
02350
02351 default:
02352 errorcode = ERR12;
02353 return -1;
02354 }
02355 } else
02356 capturing = 1;
02357
02358
02359
02360
02361
02362 if (capturing) {
02363 bracount++;
02364 if (bracount > EXTRACT_BASIC_MAX)
02365 bracket_length += 3;
02366 }
02367
02368
02369
02370
02371
02372
02373 if (brastackptr >= sizeof(brastack)/sizeof(int)) {
02374 errorcode = ERR17;
02375 return -1;
02376 }
02377
02378 bralenstack[brastackptr] = branch_extra;
02379 branch_extra = branch_newextra;
02380
02381 brastack[brastackptr++] = length;
02382 length += bracket_length;
02383 continue;
02384 }
02385
02386
02387
02388
02389
02390
02391
02392 case ')': {
02393 int duplength;
02394 length += 1 + LINK_SIZE;
02395 if (brastackptr > 0) {
02396 duplength = length - brastack[--brastackptr];
02397 branch_extra = bralenstack[brastackptr];
02398 }
02399 else
02400 duplength = 0;
02401
02402
02403
02404
02405 if ((ptr + 1 < patternEnd) && (c = ptr[1]) == '{' && isCountedRepeat(ptr + 2, patternEnd)) {
02406 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
02407 if (errorcode)
02408 return -1;
02409 } else if (c == '*') {
02410 minRepeats = 0;
02411 maxRepeats = -1;
02412 ptr++;
02413 } else if (c == '+') {
02414 minRepeats = 1;
02415 maxRepeats = -1;
02416 ptr++;
02417 } else if (c == '?') {
02418 minRepeats = 0;
02419 maxRepeats = 1;
02420 ptr++;
02421 } else {
02422 minRepeats = 1;
02423 maxRepeats = 1;
02424 }
02425
02426
02427
02428
02429
02430
02431 int repeatsLength;
02432 if (minRepeats == 0) {
02433 length++;
02434 if (maxRepeats > 0) {
02435 repeatsLength = multiplyWithOverflowCheck(maxRepeats - 1, duplength + 3 + 2 * LINK_SIZE);
02436 if (repeatsLength < 0) {
02437 errorcode = ERR16;
02438 return -1;
02439 }
02440 length += repeatsLength;
02441 if (length > MAX_PATTERN_SIZE) {
02442 errorcode = ERR16;
02443 return -1;
02444 }
02445 }
02446 }
02447
02448
02449
02450
02451
02452
02453
02454 else {
02455 repeatsLength = multiplyWithOverflowCheck(minRepeats - 1, duplength);
02456 if (repeatsLength < 0) {
02457 errorcode = ERR16;
02458 return -1;
02459 }
02460 length += repeatsLength;
02461 if (maxRepeats > minRepeats) {
02462 repeatsLength = multiplyWithOverflowCheck(maxRepeats - minRepeats, duplength + 3 + 2 * LINK_SIZE);
02463 if (repeatsLength < 0) {
02464 errorcode = ERR16;
02465 return -1;
02466 }
02467 length += repeatsLength - (2 + 2 * LINK_SIZE);
02468 }
02469 if (length > MAX_PATTERN_SIZE) {
02470 errorcode = ERR16;
02471 return -1;
02472 }
02473 }
02474
02475
02476
02477 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
02478 ptr++;
02479 length += 2 + 2 * LINK_SIZE;
02480 }
02481 continue;
02482 }
02483
02484
02485
02486
02487
02488 default:
02489 NORMAL_CHAR:
02490 length += 2;
02491 lastitemlength = 1;
02492
02493 if (c > 127) {
02494 int i;
02495 for (i = 0; i < kjs_pcre_utf8_table1_size; i++)
02496 if (c <= kjs_pcre_utf8_table1[i])
02497 break;
02498 length += i;
02499 lastitemlength += i;
02500 }
02501
02502 continue;
02503 }
02504 }
02505
02506 length += 2 + LINK_SIZE;
02507
02508 cd.numCapturingBrackets = bracount;
02509 return length;
02510 }
02511
02512
02513
02514
02515
02516
02517
02518
02519
02520
02521
02522
02523
02524
02525
02526
02527
02528
02529
02530
02531
02532
02533
02534 static inline JSRegExp* returnError(ErrorCode errorcode, const char** errorptr)
02535 {
02536 *errorptr = errorText(errorcode);
02537 return 0;
02538 }
02539
02540 JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
02541 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
02542 unsigned* numSubpatterns, const char** errorptr,
02543 malloc_t* allocate_function, free_t* free_function)
02544 {
02545
02546
02547 if (!errorptr)
02548 return 0;
02549 *errorptr = NULL;
02550
02551 CompileData cd;
02552
02553 ErrorCode errorcode = ERR0;
02554
02555 calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
02556
02557 int length = calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
02558 if (errorcode)
02559 return returnError(errorcode, errorptr);
02560
02561 if (length > MAX_PATTERN_SIZE)
02562 return returnError(ERR16, errorptr);
02563
02564 size_t size = length + sizeof(JSRegExp);
02565 JSRegExp* re = reinterpret_cast<JSRegExp*>((*allocate_function)(size));
02566
02567 if (!re)
02568 return returnError(ERR13, errorptr);
02569
02570 re->options = (ignoreCase ? IgnoreCaseOption : 0) | (multiline ? MatchAcrossMultipleLinesOption : 0);
02571
02572
02573
02574
02575 const unsigned char* codeStart = (const unsigned char*)(re + 1);
02576
02577
02578
02579
02580
02581 const UChar* ptr = (const UChar*)pattern;
02582 const UChar* patternEnd = pattern + patternLength;
02583 unsigned char* code = (unsigned char*)codeStart;
02584 int firstbyte, reqbyte;
02585 int bracketCount = 0;
02586 if (!cd.needOuterBracket)
02587 compileBranch(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, &firstbyte, &reqbyte, cd);
02588 else {
02589 *code = OP_BRA;
02590 compileBracket(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, 0, &firstbyte, &reqbyte, cd);
02591 }
02592 re->top_bracket = bracketCount;
02593 re->top_backref = cd.top_backref;
02594
02595
02596
02597 if (errorcode == 0 && ptr < patternEnd)
02598 errorcode = ERR10;
02599
02600
02601
02602
02603 *code++ = OP_END;
02604
02605 ASSERT(code - codeStart <= length);
02606 if (code - codeStart > length)
02607 errorcode = ERR7;
02608
02609
02610
02611
02612 if (re->top_backref > re->top_bracket)
02613 errorcode = ERR15;
02614
02615
02616
02617 if (errorcode != ERR0) {
02618 (*free_function)(reinterpret_cast<void*>(re));
02619 return returnError(errorcode, errorptr);
02620 }
02621
02622
02623
02624
02625
02626
02627
02628
02629
02630
02631
02632 if (cd.needOuterBracket ? bracketIsAnchored(codeStart) : branchIsAnchored(codeStart))
02633 re->options |= IsAnchoredOption;
02634 else {
02635 if (firstbyte < 0) {
02636 firstbyte = (cd.needOuterBracket
02637 ? bracketFindFirstAssertedCharacter(codeStart, false)
02638 : branchFindFirstAssertedCharacter(codeStart, false))
02639 | ((re->options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0);
02640 }
02641 if (firstbyte >= 0) {
02642 int ch = firstbyte & 255;
02643 if (ch < 127) {
02644 re->first_byte = ((firstbyte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? ch : firstbyte;
02645 re->options |= UseFirstByteOptimizationOption;
02646 }
02647 } else {
02648 if (cd.needOuterBracket ? bracketNeedsLineStart(codeStart, 0, cd.backrefMap) : branchNeedsLineStart(codeStart, 0, cd.backrefMap))
02649 re->options |= UseMultiLineFirstByteOptimizationOption;
02650 }
02651 }
02652
02653
02654
02655
02656
02657 if (reqbyte >= 0 && (!(re->options & IsAnchoredOption) || (reqbyte & REQ_VARY))) {
02658 int ch = reqbyte & 255;
02659 if (ch < 127) {
02660 re->req_byte = ((reqbyte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? (reqbyte & ~REQ_IGNORE_CASE) : reqbyte;
02661 re->options |= UseRequiredByteOptimizationOption;
02662 }
02663 }
02664
02665 if (numSubpatterns)
02666 *numSubpatterns = re->top_bracket;
02667 return re;
02668 }
02669
02670 void jsRegExpFree(JSRegExp* re, free_t* free_function)
02671 {
02672 (*free_function)(reinterpret_cast<void*>(re));
02673 }