pcre.c

Go to the documentation of this file.
00001 
00002 /*  $Id: pcre.c 1215 2006-04-06 02:44:20Z karstenw $    */
00003 
00004 /*************************************************
00005 *      Perl-Compatible Regular Expressions       *
00006 *************************************************/
00007 
00008 /*
00009 This is a library of functions to support regular expressions whose syntax
00010 and semantics are as close as possible to those of the Perl 5 language. See
00011 the file Tech.Notes for some information on the internals.
00012 
00013 Written by: Philip Hazel <ph10@cam.ac.uk>
00014 
00015            Copyright (c) 1997-2003 University of Cambridge
00016 
00017 -----------------------------------------------------------------------------
00018 Permission is granted to anyone to use this software for any purpose on any
00019 computer system, and to redistribute it freely, subject to the following
00020 restrictions:
00021 
00022 1. This software is distributed in the hope that it will be useful,
00023    but WITHOUT ANY WARRANTY; without even the implied warranty of
00024    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
00025 
00026 2. The origin of this software must not be misrepresented, either by
00027    explicit claim or by omission.
00028 
00029 3. Altered versions must be plainly marked as such, and must not be
00030    misrepresented as being the original software.
00031 
00032 4. If PCRE is embedded in any software that is released under the GNU
00033    General Purpose Licence (GPL), then the terms of that licence shall
00034    supersede any condition above with which it is incompatible.
00035 -----------------------------------------------------------------------------
00036 */
00037 
00038 /* Define DEBUG to get debugging output on stdout. Note that both DEBUG and
      DPRINTF are unconditionally #undef'd just below, so a DEBUG definition coming
      from the compiler command line or from an earlier header has no effect here;
      to turn debugging on, a #define DEBUG has to be placed after the #undef. */
00039 
00040 /* #define DEBUG */
00041 #undef DEBUG    /* 2004-10-23 aradke: don't pick these up from system headers */
00042 #undef DPRINTF
00043 
00044 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
00045 inline, and there are *still* stupid compilers about that don't like indented
00046 pre-processor statements. I suppose it's only been 10 years...
      
      DPRINTF takes a single argument which is a complete, parenthesized printf
      argument list, e.g. DPRINTF(("value %d\n", n)); with DEBUG defined this
      expands to printf ("value %d\n", n), otherwise to nothing at all. */
00047 
00048 #ifdef DEBUG
00049 #define DPRINTF(p) printf p
00050 #else
00051 #define DPRINTF(p) /*nothing*/
00052 #endif
00053 
00054 /* Include the internals header, which itself includes Standard C headers plus
00055 the external pcre header. */
00056 
00057 #include "pcre_internal.h"
00058 
00059 
00060 /* Allow compilation as C++ source code, should anybody want to do that.
      "class" is a reserved word in C++, so it is renamed by macro for the rest of
      this translation unit (presumably because the code below uses "class" as an
      ordinary identifier - confirm against the compile functions). */
00061 
00062 #ifdef __cplusplus
00063 #define class pcre_class
00064 #endif
00065 
00066 
00067 /* Maximum number of items on the nested bracket stacks at compile time. This
00068 applies to the nesting of all kinds of parentheses. It does not limit
00069 un-nested, non-capturing parentheses. This number can be made bigger if
00070 necessary - it is used to dimension one int and one unsigned char vector at
00071 compile time. */
00072 
00073 #define BRASTACK_SIZE 200
00074 
00075 
00076 /* Maximum number of ints of offset to save on the stack for recursive calls.
00077 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
00078 because the offset vector is always a multiple of 3 long. (The value below is
      10 groups of 3.) */
00079 
00080 #define REC_STACK_SAVE_MAX 30
00081 
00082 
00083 /* The number of bytes in a literal character string above which we can't add
00084 any more is set at 250 in order to allow for UTF-8 characters. (In theory it
00085 could be 255 when UTF-8 support is excluded, but that means that some of the
00086 test output would be different, which just complicates things.) */
00087 
00088 #define MAXLIT 250
00089 
00090 
00091 /* The maximum remaining length of subject we are prepared to search for a
00092 req_byte match. NOTE(review): presumably measured in bytes of subject; the
      code that uses this limit is not in this part of the file - confirm there. */
00093 
00094 #define REQ_BYTE_MAX 1000
00095 
00096 
00097 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
00098 the definition is next to the definition of the opcodes in internal.h.
      (NOTE(review): the only project header included by this file is
      "pcre_internal.h", so that is presumably where OP_LENGTHS now lives.) The
      table is indexed by opcode value, e.g. OP_lengths[*code]. */
00099 
00100 static uschar OP_lengths[] = { OP_LENGTHS };
00101 
00102 /* Min and max values for the common repeats; for the maxima, 0 => infinity.
      The two tables are parallel: entry n of rep_min pairs with entry n of rep_max.
      The six (min,max) pairs are (0,inf) (0,inf) (1,inf) (1,inf) (0,1) (0,1), which
      presumably correspond to * *? + +? ? ?? in that order - confirm against the
      opcode definitions. */
00103 
00104 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
00105 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
00106 
00107 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
00108 are simple data values; negative values are for special things like \d and so
00109 on. Zero means further processing is needed (for things like \x), or the escape
00110 is invalid.
      
      The table has one entry per character from '0' to 'z' inclusive (75 entries)
      and is indexed by (character - '0'); check_escape() range-checks the character
      before indexing. The literal 7 in the 'a' slot is the data value produced
      by \a. */
00111 
00112 static const short int escapes[] = {
00113     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
00114     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
00115   '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
00116     0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
00117     0, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
00118     0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
00119   '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
00120     0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */
00121     0,      0,  ESC_r, -ESC_s,  ESC_t,      0,      0, -ESC_w,   /* p - w */
00122     0,      0, -ESC_z                                            /* x - z */
00123 };
00124 
00125 /* Tables of names of POSIX character classes and their lengths. The list is
00126 terminated by a zero length entry. The first three must be alpha, lower, upper
00127 (the order used in the tables below), as this is assumed for handling case
      independence. posix_names has 14 entries; posix_name_lengths has the 14
      matching lengths followed by the terminating zero. */
00128 
00129 static const char *posix_names[] = {
00130   "alpha", "lower", "upper",
00131   "alnum", "ascii", "blank", "cntrl", "digit", "graph",
00132   "print", "punct", "space", "word",  "xdigit" };
00133 
00134 static const uschar posix_name_lengths[] = {
00135   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
00136 
00137 /* Table of class bit maps for each POSIX class; up to three may be combined
00138 to form the class. The table for [:blank:] is dynamically modified to remove
00139 the vertical space characters.
      
      There are three ints per class, in the same order as posix_names, so the
      entries for class n start at index 3*n. Unused slots hold -1. */
00140 
00141 static const int posix_class_maps[] = {
00142   cbit_lower, cbit_upper, -1,             /* alpha */
00143   cbit_lower, -1,         -1,             /* lower */
00144   cbit_upper, -1,         -1,             /* upper */
00145   cbit_digit, cbit_lower, cbit_upper,     /* alnum */
00146   cbit_print, cbit_cntrl, -1,             /* ascii */
00147   cbit_space, -1,         -1,             /* blank - a GNU extension */
00148   cbit_cntrl, -1,         -1,             /* cntrl */
00149   cbit_digit, -1,         -1,             /* digit */
00150   cbit_graph, -1,         -1,             /* graph */
00151   cbit_print, -1,         -1,             /* print */
00152   cbit_punct, -1,         -1,             /* punct */
00153   cbit_space, -1,         -1,             /* space */
00154   cbit_word,  -1,         -1,             /* word - a Perl extension */
00155   cbit_xdigit,-1,         -1              /* xdigit */
00156 };
00157 
00158 
00159 /* Forward declaration to allow mutual recursion: compile_regex() is defined
      later in the file but has to be callable from code that precedes it. The
      parameter names are given with the definition. */
00160 
00161 static BOOL
00162   compile_regex(int, int, int *, uschar **, const uschar **, const char **,
00163     BOOL, int, int *, int *, branch_chain *, compile_data *);
00164 
00165 /* Structure for building a chain of data that actually lives on the
00166 stack, for holding the values of the subject pointer at the start of each
00167 subpattern, so as to detect when an empty string has been matched by a
00168 subpattern - to break infinite loops. */
00169 
00170 typedef struct eptrblock {
00171   struct eptrblock *prev;        /* Link to the previous block in the chain */
00172   const uschar *saved_eptr;      /* Subject pointer saved for this block */
00173 } eptrblock;
00174 
00175 /* Flag bits for the match() function. These are distinct bits, so they can
      be combined in one flags value. */
00176 
00177 #define match_condassert   0x01    /* Called to check a condition assertion */
00178 #define match_isgroup      0x02    /* Set if start of bracketed group */
00179 
00180 /* Non-error returns from the match() function. Error returns are externally
00181 defined PCRE_ERROR_xxx codes, which are all negative, so they cannot collide
      with the two values below. */
00182 
00183 #define MATCH_MATCH        1
00184 #define MATCH_NOMATCH      0
00185 
00186 
00187 
00188 /*************************************************
00189 *               Global variables                 *
00190 *************************************************/
00191 
00192 /* PCRE is thread-clean and doesn't use any global variables in the normal
00193 sense. However, it calls memory allocation and free functions via the two
00194 indirections below, and it can optionally do callouts. These values can be
00195 changed by the caller, but are shared between all threads. However, when
00196 compiling for Virtual Pascal, things are done differently (see pcre.in).
      
      The defaults are the C library malloc() and free(), and no callout function
      (NULL). */
00197 
00198 #ifndef VPCOMPAT
00199 void *(*pcre_malloc)(size_t) = malloc;
00200 void  (*pcre_free)(void *) = free;
00201 int   (*pcre_callout)(pcre_callout_block *) = NULL;
00202 #endif
00203 
00204 
00205 /*************************************************
00206 *    Macros and tables for character handling    *
00207 *************************************************/
00208 
00209 /* When UTF-8 encoding is being used, a character is no longer just a single
00210 byte. The macros for character handling generate simple sequences when used in
00211 byte-mode, and more complicated ones for UTF-8 characters.
      
      All of these macros expand to complete statements, not expressions, and the
      UTF-8 versions expand to more than one statement, so they must not be used as
      the unbraced body of an if/else/loop. The arguments are evaluated more than
      once and must therefore be free of side effects. None of the macros checks
      that the continuation bytes exist or are well-formed. */
00212 
00213 #ifndef SUPPORT_UTF8
00214 #define GETCHAR(c, eptr) c = *eptr;
00215 #define GETCHARINC(c, eptr) c = *eptr++;
00216 #define GETCHARINCTEST(c, eptr) c = *eptr++;
00217 #define GETCHARLEN(c, eptr, len) c = *eptr;
00218 #define BACKCHAR(eptr)
00219 
00220 #else   /* SUPPORT_UTF8 */
00221 
00222 /* Get the next UTF-8 character, not advancing the pointer. This is called when
00223 we know we are in UTF-8 mode. A first byte with its top two bits set starts a
      multi-byte character: utf8_table4 gives the number of additional bytes,
      utf8_table3 masks the data bits out of the first byte, and each additional
      byte contributes its low 6 bits. */
00224 
00225 #define GETCHAR(c, eptr) \
00226   c = *eptr; \
00227   if ((c & 0xc0) == 0xc0) \
00228     { \
00229     int gcii; \
00230     int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
00231     int gcss = 6*gcaa; \
00232     c = (c & utf8_table3[gcaa]) << gcss; \
00233     for (gcii = 1; gcii <= gcaa; gcii++) \
00234       { \
00235       gcss -= 6; \
00236       c |= (eptr[gcii] & 0x3f) << gcss; \
00237       } \
00238     }
00239 
00240 /* Get the next UTF-8 character, advancing the pointer. This is called when we
00241 know we are in UTF-8 mode. On exit eptr points past the whole character. */
00242 
00243 #define GETCHARINC(c, eptr) \
00244   c = *eptr++; \
00245   if ((c & 0xc0) == 0xc0) \
00246     { \
00247     int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
00248     int gcss = 6*gcaa; \
00249     c = (c & utf8_table3[gcaa]) << gcss; \
00250     while (gcaa-- > 0) \
00251       { \
00252       gcss -= 6; \
00253       c |= (*eptr++ & 0x3f) << gcss; \
00254       } \
00255     }
00256 
00257 /* Get the next character, testing for UTF-8 mode, and advancing the pointer.
      The UTF-8 test reads md->utf8, so a variable called md must be in scope
      wherever this macro is used. */
00258 
00259 #define GETCHARINCTEST(c, eptr) \
00260   c = *eptr++; \
00261   if (md->utf8 && (c & 0xc0) == 0xc0) \
00262     { \
00263     int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
00264     int gcss = 6*gcaa; \
00265     c = (c & utf8_table3[gcaa]) << gcss; \
00266     while (gcaa-- > 0) \
00267       { \
00268       gcss -= 6; \
00269       c |= (*eptr++ & 0x3f) << gcss; \
00270       } \
00271     }
00272 
00273 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
00274 if there are extra bytes. This is called when we know we are in UTF-8 mode.
      The number of additional bytes is added to len; len is not initialized here. */
00275 
00276 #define GETCHARLEN(c, eptr, len) \
00277   c = *eptr; \
00278   if ((c & 0xc0) == 0xc0) \
00279     { \
00280     int gcii; \
00281     int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
00282     int gcss = 6*gcaa; \
00283     c = (c & utf8_table3[gcaa]) << gcss; \
00284     for (gcii = 1; gcii <= gcaa; gcii++) \
00285       { \
00286       gcss -= 6; \
00287       c |= (eptr[gcii] & 0x3f) << gcss; \
00288       } \
00289     len += gcaa; \
00290     }
00291 
00292 /* If the pointer is not at the start of a character, move it back until
00293 it is. Called only in UTF-8 mode. Bytes of the form 10xxxxxx are continuation
      bytes; the pointer is stepped back over them with no lower bound check. */
00294 
00295 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
00296 
00297 #endif
00298 
00299 
00300 
00301 /*************************************************
00302 *             Default character tables           *
00303 *************************************************/
00304 
00305 /* A default set of character tables is included in the PCRE binary. Its source
00306 is built by the maketables auxiliary program, which uses the default C ctypes
00307 functions, and put in the file chartables.c. These tables are used by PCRE
00308 whenever the caller of pcre_compile() does not provide an alternate set of
00309 tables. */
00310 
00311 #include "chartables.c"
00312 
00313 
00314 
00315 #ifdef SUPPORT_UTF8
00316 /*************************************************
00317 *           Tables for UTF-8 support             *
00318 *************************************************/
00319 
00320 /* These are the breakpoints for different numbers of bytes in a UTF-8
00321 character. Entry n is the largest character value that fits in n+1 bytes. */
00322 
00323 static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
00324 
00325 /* These are the indicator bits and the mask for the data bits to set in the
00326 first byte of a character, indexed by the number of additional bytes.
      utf8_table2 holds the indicator bits, utf8_table3 the data-bit masks. */
00327 
00328 static int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
00329 static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
00330 
00331 /* Table of the number of extra bytes, indexed by the first byte of the
00332 character masked with 0x3f (64 entries). The highest number for a valid UTF-8
00333 character is in fact 0x3d. */
00334 
00335 static uschar utf8_table4[] = {
00336   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00337   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00338   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
00339   3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
00340 
00341 
00342 /*************************************************
00343 *       Convert character value to UTF-8         *
00344 *************************************************/
00345 
00346 /* This function takes an integer value in the range 0 - 0x7fffffff
00347 and encodes it as a UTF-8 character in 0 to 6 bytes.
00348 
00349 Arguments:
00350   cvalue     the character value
00351   buffer     pointer to buffer for result - at least 6 bytes long
00352 
00353 Returns:     number of characters placed in the buffer
00354 */
00355 
00356 static int
00357 ord2utf8(int cvalue, uschar *buffer)
00358 {
00359 register int i, j;
00360 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
00361   if (cvalue <= utf8_table1[i]) break;
00362 buffer += i;
00363 for (j = i; j > 0; j--)
00364  {
00365  *buffer-- = 0x80 | (cvalue & 0x3f);
00366  cvalue >>= 6;
00367  }
00368 *buffer = utf8_table2[i] | cvalue;
00369 return i + 1;
00370 }
00371 #endif
00372 
00373 
00374 
00375 /*************************************************
00376 *         Print compiled regex                   *
00377 *************************************************/
00378 
00379 /* The code for doing this is held in a separate file that is also included in
00380 pcretest.c. It defines a function called print_internals(). */
00381 
00382 #ifdef DEBUG
00383 #include "printint.c"
00384 #endif
00385 
00386 
00387 
00388 /*************************************************
00389 *          Return version string                 *
00390 *************************************************/
00391 
/* Two-level stringification: XSTRING expands its argument before STRING turns
the result into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* Return the version of the library as a string of the form
"<major>.<minor> <date>", built at compile time from PCRE_MAJOR, PCRE_MINOR
and PCRE_DATE. The string has static storage and must not be modified or
freed by the caller. */

const char *
pcre_version(void)
{
  static const char version_text[] =
    XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);

  return version_text;
}
00400 
00401 
00402 
00403 
00404 /*************************************************
00405 * (Obsolete) Return info about compiled pattern  *
00406 *************************************************/
00407 
00408 /* This is the original "info" function. It picks potentially useful data out
00409 of the private structure, but its interface was too rigid. It remains for
00410 backwards compatibility. The public options are passed back in an int - though
00411 the re->options field has been expanded to a long int, all the public options
00412 at the low end of it, and so even on 16-bit systems this will still be OK.
00413 Therefore, I haven't changed the API for pcre_info().
00414 
00415 Arguments:
00416   external_re   points to compiled code
00417   optptr        where to pass back the options
00418   first_byte    where to pass back the first character,
00419                 or -1 if multiline and all branches start ^,
00420                 or -2 otherwise
00421 
00422 Returns:        number of capturing subpatterns
00423                 or negative values on error
00424 */
00425 
00426 int
00427 pcre_info(const pcre *external_re, int *optptr, int *first_byte)
00428 {
00429 const real_pcre *re = (const real_pcre *)external_re;
00430 if (re == NULL) return PCRE_ERROR_NULL;
00431 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
00432 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
00433 if (first_byte != NULL)
00434   *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
00435      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
00436 return re->top_bracket;
00437 }
00438 
00439 
00440 
00441 /*************************************************
00442 *        Return info about compiled pattern      *
00443 *************************************************/
00444 
00445 /* This is a newer "info" function which has an extensible interface so
00446 that additional items can be added compatibly.
00447 
00448 Arguments:
00449   external_re      points to compiled code
00450   extra_data       points extra data, or NULL
00451   what             what information is required
00452   where            where to put the information
00453 
00454 Returns:           0 if data returned, negative on error
00455 */
00456 
00457 int
00458 pcre_fullinfo(const pcre *external_re, const pcre_extra *extra_data, int what,
00459   void *where)
00460 {
00461 const real_pcre *re = (const real_pcre *)external_re;
00462 const pcre_study_data *study = NULL;
00463 
00464 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
00465 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
00466 
00467 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
00468   study = extra_data->study_data;
00469 
00470 switch (what)
00471   {
00472   case PCRE_INFO_OPTIONS:
00473   *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
00474   break;
00475 
00476   case PCRE_INFO_SIZE:
00477   *((size_t *)where) = re->size;
00478   break;
00479 
00480   case PCRE_INFO_STUDYSIZE:
00481   *((size_t *)where) = (study == NULL)? 0 : study->size;
00482   break;
00483 
00484   case PCRE_INFO_CAPTURECOUNT:
00485   *((int *)where) = re->top_bracket;
00486   break;
00487 
00488   case PCRE_INFO_BACKREFMAX:
00489   *((int *)where) = re->top_backref;
00490   break;
00491 
00492   case PCRE_INFO_FIRSTBYTE:
00493   *((int *)where) =
00494     ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
00495     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
00496   break;
00497 
00498   case PCRE_INFO_FIRSTTABLE:
00499   *((const uschar **)where) =
00500     (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
00501       study->start_bits : NULL;
00502   break;
00503 
00504   case PCRE_INFO_LASTLITERAL:
00505   *((int *)where) =
00506     ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
00507   break;
00508 
00509   case PCRE_INFO_NAMEENTRYSIZE:
00510   *((int *)where) = re->name_entry_size;
00511   break;
00512 
00513   case PCRE_INFO_NAMECOUNT:
00514   *((int *)where) = re->name_count;
00515   break;
00516 
00517   case PCRE_INFO_NAMETABLE:
00518   *((const uschar **)where) = (const uschar *)re + sizeof(real_pcre);
00519   break;
00520 
00521   default: return PCRE_ERROR_BADOPTION;
00522   }
00523 
00524 return 0;
00525 }
00526 
00527 
00528 
00529 /*************************************************
00530 * Return info about what features are configured *
00531 *************************************************/
00532 
00533 /* This is function which has an extensible interface so that additional items
00534 can be added compatibly.
00535 
00536 Arguments:
00537   what             what information is required
00538   where            where to put the information
00539 
00540 Returns:           0 if data returned, negative on error
00541 */
00542 
00543 int
00544 pcre_config(int what, void *where)
00545 {
00546 switch (what)
00547   {
00548   case PCRE_CONFIG_UTF8:
00549   #ifdef SUPPORT_UTF8
00550   *((int *)where) = 1;
00551   #else
00552   *((int *)where) = 0;
00553   #endif
00554   break;
00555 
00556   case PCRE_CONFIG_NEWLINE:
00557   *((int *)where) = NEWLINE;
00558   break;
00559 
00560   case PCRE_CONFIG_LINK_SIZE:
00561   *((int *)where) = LINK_SIZE;
00562   break;
00563 
00564   case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
00565   *((int *)where) = POSIX_MALLOC_THRESHOLD;
00566   break;
00567 
00568   case PCRE_CONFIG_MATCH_LIMIT:
00569   *((unsigned int *)where) = MATCH_LIMIT;
00570   break;
00571 
00572   default: return PCRE_ERROR_BADOPTION;
00573   }
00574 
00575 return 0;
00576 }
00577 
00578 
00579 
00580 #ifdef DEBUG
00581 /*************************************************
00582 *        Debugging function to print chars       *
00583 *************************************************/
00584 
00585 /* Print a sequence of chars in printable format, stopping at the end of the
00586 subject if the requested.
00587 
00588 Arguments:
00589   p           points to characters
00590   length      number to print
00591   is_subject  TRUE if printing from within md->start_subject
00592   md          pointer to matching data block, if is_subject is TRUE
00593 
00594 Returns:     nothing
00595 */
00596 
00597 static void
00598 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
00599 {
00600 int c;
00601 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
00602 while (length-- > 0)
00603   if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
00604 }
00605 #endif
00606 
00607 
00608 
00609 
00610 /*************************************************
00611 *            Handle escapes                      *
00612 *************************************************/
00613 
00614 /* This function is called when a \ has been encountered. It either returns a
00615 positive value for a simple escape such as \n, or a negative value which
00616 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
00617 a positive value greater than 255 may be returned. On entry, ptr is pointing at
00618 the \. On exit, it is on the final character of the escape sequence.
00619 
00620 Arguments:
00621   ptrptr     points to the pattern position pointer
00622   errorptr   points to the pointer to the error message
00623   bracount   number of previous extracting brackets
00624   options    the options bits
00625   isclass    TRUE if inside a character class
00626   cd         pointer to char tables block
00627 
00628 Returns:     zero or positive => a data character
00629              negative => a special escape sequence
00630              on error, errorptr is set
00631 */
00632 
00633 static int
00634 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
00635   int options, BOOL isclass, compile_data *cd)
00636 {
00637 const uschar *ptr = *ptrptr;
00638 int c, i;
      /* c is the character after the backslash and, later, the value to return.
      i first receives the result of the table lookup and is afterwards used as a
      digit counter: the switch below is reached only when the lookup produced
      zero, so the counting loops start from i == 0. */
00639 
00640 /* If backslash is at the end of the pattern, it's an error. */
00641 
00642 c = *(++ptr);
00643 if (c == 0) *errorptr = ERR1;
00644 
00645 /* Digits or letters may have special meaning; all others are literals. This
      test also keeps the index into escapes[] below within the table. */
00646 
00647 else if (c < '0' || c > 'z') {}
00648 
00649 /* Do an initial lookup in a table. A non-zero result is something that can be
00650 returned immediately. Otherwise further processing may be required. */
00651 
00652 else if ((i = escapes[c - '0']) != 0) c = i;
00653 
00654 /* Escapes that need further processing, or are illegal. */
00655 
00656 else
00657   {
00658   const uschar *oldptr;
00659   switch (c)
00660     {
00661     /* A number of Perl escapes are not handled by PCRE. We give an explicit
00662     error. */
00663 
00664     case 'l':
00665     case 'L':
00666     case 'N':
00667     case 'p':
00668     case 'P':
00669     case 'u':
00670     case 'U':
00671     case 'X':
00672     *errorptr = ERR37;
00673     break;
00674 
00675     /* The handling of escape sequences consisting of a string of digits
00676     starting with one that is not zero is not straightforward. By experiment,
00677     the way Perl works seems to be as follows:
00678 
00679     Outside a character class, the digits are read as a decimal number. If the
00680     number is less than 10, or if there are that many previous extracting
00681     left brackets, then it is a back reference. Otherwise, up to three octal
00682     digits are read to form an escaped byte. Thus \123 is likely to be octal
00683     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
00684     value is greater than 377, the least significant 8 bits are taken. Inside a
00685     character class, \ followed by a digit is always an octal number. */
00686 
00687     case '1': case '2': case '3': case '4': case '5':
00688     case '6': case '7': case '8': case '9':
00689 
00690     if (!isclass)
00691       {
00692       oldptr = ptr;
00693       c -= '0';
          /* NOTE(review): the number of digits is not limited here, so a very
          long digit string can overflow c. */
00694       while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
00695         c = c * 10 + *(++ptr) - '0';
00696       if (c < 10 || c <= bracount)
00697         {
00698         c = -(ESC_REF + c);
00699         break;
00700         }
00701       ptr = oldptr;      /* Put the pointer back and fall through */
00702       }
00703 
00704     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
00705     generates a binary zero byte and treats the digit as a following literal.
00706     Thus we have to pull back the pointer by one. */
00707 
00708     if ((c = *ptr) >= '8')
00709       {
00710       ptr--;
00711       c = 0;
00712       break;
00713       }
00714 
00715     /* \0 always starts an octal number, but we may drop through to here with a
00716     larger first octal digit. At this point c is the first digit character and
          i is still zero, so the loop reads at most two further octal digits. */
00717 
00718     case '0':
00719     c -= '0';
00720     while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
00721       ptr[1] != '8' && ptr[1] != '9')
00722         c = c * 8 + *(++ptr) - '0';
00723     c &= 255;     /* Take least significant 8 bits */
00724     break;
00725 
00726     /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
00727     which can be greater than 0xff, but only if the ddd are hex digits. */
00728 
00729     case 'x':
00730 #ifdef SUPPORT_UTF8
00731     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
00732       {
00733       const uschar *pt = ptr + 2;
00734       register int count = 0;
00735       c = 0;
          /* A decimal digit has '0' subtracted; for a hex letter the value from
          cd->lcc (presumably the lower-cased letter) has 'W' subtracted, 'W'
          being 'a' - 10 in ASCII, which gives 10 to 15. */
00736       while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
00737         {
00738         count++;
00739         c = c * 16 + cd->lcc[*pt] -
00740           (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
00741         pt++;
00742         }
00743       if (*pt == '}')
00744         {
00745         if (c < 0 || count > 8) *errorptr = ERR34;
00746         ptr = pt;
00747         break;
00748         }
00749       /* If the sequence of hex digits does not end with '}', then we don't
00750       recognize this construct; fall through to the normal \x handling. */
00751       }
00752 #endif
00753 
00754     /* Read just a single hex char: at most two hex digits, counted by i, which
          is still zero here. With no hex digit at all the result is zero. */
00755 
00756     c = 0;
00757     while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
00758       {
00759       ptr++;
00760       c = c * 16 + cd->lcc[*ptr] -
00761         (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
00762       }
00763     break;
00764 
00765     /* Other special escapes not starting with a digit are straightforward */
00766 
00767     case 'c':
00768     c = *(++ptr);
00769     if (c == 0)
00770       {
00771       *errorptr = ERR2;
          /* This early return leaves *ptrptr unchanged. */
00772       return 0;
00773       }
00774 
00775     /* A letter is upper-cased; then the 0x40 bit is flipped */
00776 
00777     if (c >= 'a' && c <= 'z') c = cd->fcc[c];
00778     c ^= 0x40;
00779     break;
00780 
00781     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
00782     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
00783     for Perl compatibility, it is a literal. This code looks a bit odd, but
00784     there used to be some cases other than the default, and there may be again
00785     in future, so I haven't "optimized" it. */
00786 
00787     default:
00788     if ((options & PCRE_EXTRA) != 0) switch(c)
00789       {
00790       default:
00791       *errorptr = ERR3;
00792       break;
00793       }
00794     break;
00795     }
00796   }
00797 
      /* Pass back the updated pattern pointer, which is now on the last character
      of the escape sequence. */
00798 *ptrptr = ptr;
00799 return c;
00800 }
00801 
00802 
00803 
00804 /*************************************************
00805 *            Check for counted repeat            *
00806 *************************************************/
00807 
00808 /* This function is called when a '{' is encountered in a place where it might
00809 start a quantifier. It looks ahead to see if it really is a quantifier or not.
00810 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
00811 where the ddds are digits.
00812 
00813 Arguments:
00814   p         pointer to the first char after '{'
00815   cd        pointer to char tables block
00816 
00817 Returns:    TRUE or FALSE
00818 */
00819 
00820 static BOOL
00821 is_counted_repeat(const uschar *p, compile_data *cd)
00822 {
00823 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
00824 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
00825 if (*p == '}') return TRUE;
00826 
00827 if (*p++ != ',') return FALSE;
00828 if (*p == '}') return TRUE;
00829 
00830 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
00831 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
00832 return (*p == '}');
00833 }
00834 
00835 
00836 
00837 /*************************************************
00838 *         Read repeat counts                     *
00839 *************************************************/
00840 
00841 /* Read an item of the form {n,m} and return the values. This is called only
00842 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
00843 so the syntax is guaranteed to be correct, but we need to check the values.
00844 
00845 Arguments:
00846   p          pointer to first char after '{'
00847   minp       pointer to int for min
00848   maxp       pointer to int for max
00849              returned as -1 if no max
00850   errorptr   points to pointer to error message
00851   cd         pointer to character tables clock
00852 
00853 Returns:     pointer to '}' on success;
00854              current ptr on error, with errorptr set
00855 */
00856 
00857 static const uschar *
00858 read_repeat_counts(const uschar *p, int *minp, int *maxp,
00859   const char **errorptr, compile_data *cd)
00860 {
00861 int min = 0;
00862 int max = -1;
00863 
00864 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
00865 
00866 if (*p == '}') max = min; else
00867   {
00868   if (*(++p) != '}')
00869     {
00870     max = 0;
00871     while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
00872     if (max < min)
00873       {
00874       *errorptr = ERR4;
00875       return p;
00876       }
00877     }
00878   }
00879 
00880 /* Do paranoid checks, then fill in the required variables, and pass back the
00881 pointer to the terminating '}'. */
00882 
00883 if (min > 65535 || max > 65535)
00884   *errorptr = ERR5;
00885 else
00886   {
00887   *minp = min;
00888   *maxp = max;
00889   }
00890 return p;
00891 }
00892 
00893 
00894 
00895 /*************************************************
00896 *      Find first significant op code            *
00897 *************************************************/
00898 
00899 /* This is called by several functions that scan a compiled expression looking
00900 for a fixed first character, or an anchoring op code etc. It skips over things
00901 that do not influence this. For some calls, a change of option is important.
00902 
00903 Arguments:
00904   code       pointer to the start of the group
00905   options    pointer to external options
00906   optbit     the option bit whose changing is significant, or
00907                zero if none are
00908 
00909 Returns:     pointer to the first significant opcode
00910 */
00911 
00912 static const uschar*
00913 first_significant_code(const uschar *code, int *options, int optbit)
00914 {
00915 for (;;)
00916   {
00917   switch ((int)*code)
00918     {
00919     case OP_OPT:
00920     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
00921       *options = (int)code[1];
00922     code += 2;
00923     break;
00924 
00925     case OP_ASSERT_NOT:
00926     case OP_ASSERTBACK:
00927     case OP_ASSERTBACK_NOT:
00928     do code += GET(code, 1); while (*code == OP_ALT);
00929     /* Fall through */
00930 
00931     case OP_CALLOUT:
00932     case OP_CREF:
00933     case OP_BRANUMBER:
00934     case OP_WORD_BOUNDARY:
00935     case OP_NOT_WORD_BOUNDARY:
00936     code += OP_lengths[*code];
00937     break;
00938 
00939     default:
00940     return code;
00941     }
00942   }
00943 /* Control never reaches here */
00944 }
00945 
00946 
00947 
00948 
00949 /*************************************************
00950 *        Find the fixed length of a pattern      *
00951 *************************************************/
00952 
00953 /* Scan a pattern and compute the fixed length of subject that will match it,
00954 if the length is fixed. This is needed for dealing with backward assertions.
00955 In UTF8 mode, the result is in characters rather than bytes.
00956 
00957 Arguments:
00958   code     points to the start of the pattern (the bracket)
00959   options  the compiling options
00960 
00961 Returns:   the fixed length, or -1 if there is no fixed length,
00962              or -2 if \C was encountered
00963 */
00964 
00965 static int
00966 find_fixedlength(uschar *code, int options)
00967 {
00968 int length = -1;
00969 
00970 register int branchlength = 0;
00971 register uschar *cc = code + 1 + LINK_SIZE;
00972 
00973 /* Scan along the opcodes for this branch. If we get to the end of the
00974 branch, check the length against that of the other branches. */
00975 
00976 for (;;)
00977   {
00978   int d;
00979   register int op = *cc;
00980   if (op >= OP_BRA) op = OP_BRA;
00981 
00982   switch (op)
00983     {
00984     case OP_BRA:
00985     case OP_ONCE:
00986     case OP_COND:
00987     d = find_fixedlength(cc, options);
00988     if (d < 0) return d;
00989     branchlength += d;
00990     do cc += GET(cc, 1); while (*cc == OP_ALT);
00991     cc += 1 + LINK_SIZE;
00992     break;
00993 
00994     /* Reached end of a branch; if it's a ket it is the end of a nested
00995     call. If it's ALT it is an alternation in a nested call. If it is
00996     END it's the end of the outer call. All can be handled by the same code. */
00997 
00998     case OP_ALT:
00999     case OP_KET:
01000     case OP_KETRMAX:
01001     case OP_KETRMIN:
01002     case OP_END:
01003     if (length < 0) length = branchlength;
01004       else if (length != branchlength) return -1;
01005     if (*cc != OP_ALT) return length;
01006     cc += 1 + LINK_SIZE;
01007     branchlength = 0;
01008     break;
01009 
01010     /* Skip over assertive subpatterns */
01011 
01012     case OP_ASSERT:
01013     case OP_ASSERT_NOT:
01014     case OP_ASSERTBACK:
01015     case OP_ASSERTBACK_NOT:
01016     do cc += GET(cc, 1); while (*cc == OP_ALT);
01017     /* Fall through */
01018 
01019     /* Skip over things that don't match chars */
01020 
01021     case OP_REVERSE:
01022     case OP_BRANUMBER:
01023     case OP_CREF:
01024     case OP_OPT:
01025     case OP_CALLOUT:
01026     case OP_SOD:
01027     case OP_SOM:
01028     case OP_EOD:
01029     case OP_EODN:
01030     case OP_CIRC:
01031     case OP_DOLL:
01032     case OP_NOT_WORD_BOUNDARY:
01033     case OP_WORD_BOUNDARY:
01034     cc += OP_lengths[*cc];
01035     break;
01036 
01037     /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
01038     This requires a scan of the string, unfortunately. We assume valid UTF-8
01039     strings, so all we do is reduce the length by one for every byte whose bits
01040     are 10xxxxxx. */
01041 
01042     case OP_CHARS:
01043     branchlength += *(++cc);
01044 #ifdef SUPPORT_UTF8
01045     if ((options & PCRE_UTF8) != 0)
01046       for (d = 1; d <= *cc; d++)
01047         if ((cc[d] & 0xc0) == 0x80) branchlength--;
01048 #endif
01049     cc += *cc + 1;
01050     break;
01051 
01052     /* Handle exact repetitions. The count is already in characters, but we
01053     need to skip over a multibyte character in UTF8 mode.  */
01054 
01055     case OP_EXACT:
01056     branchlength += GET2(cc,1);
01057     cc += 4;
01058 #ifdef SUPPORT_UTF8
01059     if ((options & PCRE_UTF8) != 0)
01060       {
01061       while((*cc & 0x80) == 0x80) cc++;
01062       }
01063 #endif
01064     break;
01065 
01066     case OP_TYPEEXACT:
01067     branchlength += GET2(cc,1);
01068     cc += 4;
01069     break;
01070 
01071     /* Handle single-char matchers */
01072 
01073     case OP_NOT_DIGIT:
01074     case OP_DIGIT:
01075     case OP_NOT_WHITESPACE:
01076     case OP_WHITESPACE:
01077     case OP_NOT_WORDCHAR:
01078     case OP_WORDCHAR:
01079     case OP_ANY:
01080     branchlength++;
01081     cc++;
01082     break;
01083 
01084     /* The single-byte matcher isn't allowed */
01085 
01086     case OP_ANYBYTE:
01087     return -2;
01088 
01089     /* Check a class for variable quantification */
01090 
01091 #ifdef SUPPORT_UTF8
01092     case OP_XCLASS:
01093     cc += GET(cc, 1) - 33;
01094     /* Fall through */
01095 #endif
01096 
01097     case OP_CLASS:
01098     case OP_NCLASS:
01099     cc += 33;
01100 
01101     switch (*cc)
01102       {
01103       case OP_CRSTAR:
01104       case OP_CRMINSTAR:
01105       case OP_CRQUERY:
01106       case OP_CRMINQUERY:
01107       return -1;
01108 
01109       case OP_CRRANGE:
01110       case OP_CRMINRANGE:
01111       if (GET2(cc,1) != GET2(cc,3)) return -1;
01112       branchlength += GET2(cc,1);
01113       cc += 5;
01114       break;
01115 
01116       default:
01117       branchlength++;
01118       }
01119     break;
01120 
01121     /* Anything else is variable length */
01122 
01123     default:
01124     return -1;
01125     }
01126   }
01127 /* Control never gets here */
01128 }
01129 
01130 
01131 
01132 
01133 /*************************************************
01134 *    Scan compiled regex for numbered bracket    *
01135 *************************************************/
01136 
01137 /* This little function scans through a compiled pattern until it finds a
01138 capturing bracket with the given number.
01139 
01140 Arguments:
01141   code        points to start of expression
01142   utf8        TRUE in UTF-8 mode
01143   number      the required bracket number
01144 
01145 Returns:      pointer to the opcode for the bracket, or NULL if not found
01146 */
01147 
01148 static const uschar *
01149 find_bracket(const uschar *code, BOOL utf8, int number)
01150 {
01151 #ifndef SUPPORT_UTF8
01152 utf8 = utf8;               /* Stop pedantic compilers complaining */
01153 #endif
01154 
01155 for (;;)
01156   {
01157   register int c = *code;
01158   if (c == OP_END) return NULL;
01159   else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
01160   else if (c > OP_BRA)
01161     {
01162     int n = c - OP_BRA;
01163     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
01164     if (n == number) return (uschar *)code;
01165     code += OP_lengths[OP_BRA];
01166     }
01167   else
01168     {
01169     code += OP_lengths[c];
01170 
01171     /* In UTF-8 mode, opcodes that are followed by a character may be followed
01172     by a multi-byte character. The length in the table is a minimum, so we have
01173     to scan along to skip the extra characters. All opcodes are less than 128,
01174     so we can use relatively efficient code. */
01175 
01176 #ifdef SUPPORT_UTF8
01177     if (utf8) switch(c)
01178       {
01179       case OP_EXACT:
01180       case OP_UPTO:
01181       case OP_MINUPTO:
01182       case OP_STAR:
01183       case OP_MINSTAR:
01184       case OP_PLUS:
01185       case OP_MINPLUS:
01186       case OP_QUERY:
01187       case OP_MINQUERY:
01188       while ((*code & 0xc0) == 0x80) code++;
01189       break;
01190       }
01191 #endif
01192     }
01193   }
01194 }
01195 
01196 
01197 
01198 /*************************************************
01199 *    Scan compiled branch for non-emptiness      *
01200 *************************************************/
01201 
01202 /* This function scans through a branch of a compiled pattern to see whether it
01203 can match the empty string or not. It is called only from could_be_empty()
01204 below. Note that first_significant_code() skips over assertions. If we hit an
01205 unclosed bracket, we return "empty" - this means we've struck an inner bracket
01206 whose current branch will already have been scanned.
01207 
01208 Arguments:
01209   code        points to start of search
01210   endcode     points to where to stop
01211   utf8        TRUE if in UTF8 mode
01212 
01213 Returns:      TRUE if what is matched could be empty
01214 */
01215 
static BOOL
could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
{
register int c;

/* Start just after the opening opcode and its link field. Each step of the
loop moves on by OP_lengths[c], where c is whatever the body last stored, and
then lets first_significant_code() skip the items it ignores. NULL and 0 are
passed because no option bit is being tracked here. */

for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
     code < endcode;
     code = first_significant_code(code + OP_lengths[c], NULL, 0))
  {
  const uschar *ccode;

  c = *code;

  /* Any opcode at or above OP_BRA is a bracket of some kind */

  if (c >= OP_BRA)
    {
    BOOL empty_branch;
    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */

    /* Scan a closed bracket */

    /* Visit every alternative; the group can be empty if any one of them can.
    Once one empty alternative is found the remaining ones are only stepped
    over, not scanned. */

    empty_branch = FALSE;
    do
      {
      if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
        empty_branch = TRUE;
      code += GET(code, 1);
      }
    while (*code == OP_ALT);
    if (!empty_branch) return FALSE;   /* All branches are non-empty */

    /* NOTE(review): code is moved 1 + LINK_SIZE bytes on from the opcode that
    ended the alternatives, and c is reloaded from there, so the loop increment
    then adds OP_lengths[] of that following opcode before the next item is
    examined - confirm that this is the intended position. */

    code += 1 + LINK_SIZE;
    c = *code;
    }

  else switch (c)
    {
    /* Check for quantifiers after a class */

    /* For an extended class the offset to the item after it is read from the
    class itself; for the bitmap classes it is a fixed 33 bytes. */

#ifdef SUPPORT_UTF8
    case OP_XCLASS:
    ccode = code + GET(code, 1);
    goto CHECK_CLASS_REPEAT;
#endif

    case OP_CLASS:
    case OP_NCLASS:
    ccode = code + 33;

#ifdef SUPPORT_UTF8
    CHECK_CLASS_REPEAT:
#endif

    switch (*ccode)
      {
      case OP_CRSTAR:            /* These could be empty; continue */
      case OP_CRMINSTAR:
      case OP_CRQUERY:
      case OP_CRMINQUERY:
      break;

      default:                   /* Non-repeat => class must match */
      case OP_CRPLUS:            /* These repeats aren't empty */
      case OP_CRMINPLUS:
      return FALSE;

      case OP_CRRANGE:
      case OP_CRMINRANGE:
      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
      break;
      }
    break;

    /* Opcodes that must match a character */

    case OP_NOT_DIGIT:
    case OP_DIGIT:
    case OP_NOT_WHITESPACE:
    case OP_WHITESPACE:
    case OP_NOT_WORDCHAR:
    case OP_WORDCHAR:
    case OP_ANY:
    case OP_ANYBYTE:
    case OP_CHARS:
    case OP_NOT:
    case OP_PLUS:
    case OP_MINPLUS:
    case OP_EXACT:
    case OP_NOTPLUS:
    case OP_NOTMINPLUS:
    case OP_NOTEXACT:
    case OP_TYPEPLUS:
    case OP_TYPEMINPLUS:
    case OP_TYPEEXACT:
    return FALSE;

    /* End of branch */

    case OP_KET:
    case OP_KETRMAX:
    case OP_KETRMIN:
    case OP_ALT:
    return TRUE;

    /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be
    followed by a multibyte character */

    /* code is nudged forward once for each byte at code[2] of the form
    10xxxxxx, so that the fixed OP_lengths[c] step in the loop increment also
    covers those bytes. NOTE(review): the same code[2] offset is used for all
    six opcodes - confirm it suits UPTO/MINUPTO as well as the others. */

#ifdef SUPPORT_UTF8
    case OP_STAR:
    case OP_MINSTAR:
    case OP_QUERY:
    case OP_MINQUERY:
    case OP_UPTO:
    case OP_MINUPTO:
    if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
    break;
#endif
    }
  }

/* Reached endcode without meeting anything that must match a character */

return TRUE;
}
01335 
01336 
01337 
01338 /*************************************************
01339 *    Scan compiled regex for non-emptiness       *
01340 *************************************************/
01341 
01342 /* This function is called to check for left recursive calls. We want to check
01343 the current branch of the current pattern to see if it could match the empty
01344 string. If it could, we must look outwards for branches at other levels,
01345 stopping when we pass beyond the bracket which is the subject of the recursion.
01346 
01347 Arguments:
01348   code        points to start of the recursion
01349   endcode     points to where to stop (current RECURSE item)
01350   bcptr       points to the chain of current (unclosed) branch starts
01351   utf8        TRUE if in UTF-8 mode
01352 
01353 Returns:      TRUE if what is matched could be empty
01354 */
01355 
01356 static BOOL
01357 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
01358   BOOL utf8)
01359 {
01360 while (bcptr != NULL && bcptr->current >= code)
01361   {
01362   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
01363   bcptr = bcptr->outer;
01364   }
01365 return TRUE;
01366 }
01367 
01368 
01369 
01370 /*************************************************
01371 *           Check for POSIX class syntax         *
01372 *************************************************/
01373 
01374 /* This function is called when the sequence "[:" or "[." or "[=" is
01375 encountered in a character class. It checks whether this is followed by an
01376 optional ^ and then a sequence of letters, terminated by a matching ":]" or
01377 ".]" or "=]".
01378 
01379 Argument:
01380   ptr      pointer to the initial [
01381   endptr   where to return the end pointer
01382   cd       pointer to compile data
01383 
01384 Returns:   TRUE or FALSE
01385 */
01386 
01387 static BOOL
01388 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
01389 {
01390 int terminator;          /* Don't combine these lines; the Solaris cc */
01391 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
01392 if (*(++ptr) == '^') ptr++;
01393 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
01394 if (*ptr == terminator && ptr[1] == ']')
01395   {
01396   *endptr = ptr;
01397   return TRUE;
01398   }
01399 return FALSE;
01400 }
01401 
01402 
01403 
01404 
01405 /*************************************************
01406 *          Check POSIX class name                *
01407 *************************************************/
01408 
01409 /* This function is called to check the name given in a POSIX-style class entry
01410 such as [:alnum:].
01411 
01412 Arguments:
01413   ptr        points to the first letter
01414   len        the length of the name
01415 
01416 Returns:     a value representing the name, or -1 if unknown
01417 */
01418 
01419 static int
01420 check_posix_name(const uschar *ptr, int len)
01421 {
01422 register int yield = 0;
01423 while (posix_name_lengths[yield] != 0)
01424   {
01425   if (len == posix_name_lengths[yield] &&
01426     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
01427   yield++;
01428   }
01429 return -1;
01430 }
01431 
01432 
01433 
01434 
01435 /*************************************************
01436 *           Compile one branch                   *
01437 *************************************************/
01438 
01439 /* Scan the pattern, compiling it into the code vector. If the options are
01440 changed during the branch, the pointer is used to change the external options
01441 bits.
01442 
01443 Arguments:
01444   optionsptr     pointer to the option bits
01445   brackets       points to number of extracting brackets used
01446   code           points to the pointer to the current code point
01447   ptrptr         points to the current pattern pointer
01448   errorptr       points to pointer to error message
01449   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
01450   reqbyteptr     set to the last literal character required, else < 0
01451   bcptr          points to current branch chain
01452   cd             contains pointers to tables etc.
01453 
01454 Returns:         TRUE on success
01455                  FALSE, with *errorptr set on error
01456 */
01457 
01458 static BOOL
01459 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
01460   const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
01461   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
01462 {
01463 int repeat_type, op_type;
01464 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
01465 int bravalue = 0;
01466 int length;
01467 int greedy_default, greedy_non_default;
01468 int firstbyte, reqbyte;
01469 int zeroreqbyte, zerofirstbyte;
01470 int req_caseopt, reqvary, tempreqvary;
01471 int condcount = 0;
01472 int options = *optionsptr;
01473 register int c;
01474 register uschar *code = *codeptr;
01475 uschar *tempcode;
01476 BOOL inescq = FALSE;
01477 BOOL groupsetfirstbyte = FALSE;
01478 const uschar *ptr = *ptrptr;
01479 const uschar *tempptr;
01480 uschar *previous = NULL;
01481 uschar class[32];
01482 
01483 #ifdef SUPPORT_UTF8
01484 BOOL class_utf8;
01485 BOOL utf8 = (options & PCRE_UTF8) != 0;
01486 uschar *class_utf8data;
01487 uschar utf8_char[6];
01488 #else
01489 BOOL utf8 = FALSE;
01490 #endif
01491 
01492 /* Set up the default and non-default settings for greediness */
01493 
01494 greedy_default = ((options & PCRE_UNGREEDY) != 0);
01495 greedy_non_default = greedy_default ^ 1;
01496 
01497 /* Initialize no first char, no required char. REQ_UNSET means "no char
01498 matching encountered yet". It gets changed to REQ_NONE if we hit something that
01499 matches a non-fixed char first char; reqbyte just remains unset if we never
01500 find one.
01501 
01502 When we hit a repeat whose minimum is zero, we may have to adjust these values
01503 to take the zero repeat into account. This is implemented by setting them to
01504 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
01505 item types that can be repeated set these backoff variables appropriately. */
01506 
01507 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
01508 
01509 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
01510 according to the current setting of the caseless flag. REQ_CASELESS is a bit
01511 value > 255. It is added into the firstbyte or reqbyte variables to record the
01512 case status of the value. */
01513 
01514 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
01515 
01516 /* Switch on next character until the end of the branch */
01517 
01518 for (;; ptr++)
01519   {
01520   BOOL negate_class;
01521   BOOL possessive_quantifier;
01522   int class_charcount;
01523   int class_lastchar;
01524   int newoptions;
01525   int recno;
01526   int skipbytes;
01527   int subreqbyte;
01528   int subfirstbyte;
01529 
01530   c = *ptr;
01531   if (inescq && c != 0) goto NORMAL_CHAR;
01532 
01533   if ((options & PCRE_EXTENDED) != 0)
01534     {
01535     if ((cd->ctypes[c] & ctype_space) != 0) continue;
01536     if (c == '#')
01537       {
01538       /* The space before the ; is to avoid a warning on a silly compiler
01539       on the Macintosh. */
01540       while ((c = *(++ptr)) != 0 && c != NEWLINE) ; /*** FIXME: test LF too? ***/
01541       if (c != 0) continue;   /* Else fall through to handle end of string */
01542       }
01543     }
01544 
01545   switch(c)
01546     {
01547     /* The branch terminates at end of string, |, or ). */
01548 
01549     case 0:
01550     case '|':
01551     case ')':
01552     *firstbyteptr = firstbyte;
01553     *reqbyteptr = reqbyte;
01554     *codeptr = code;
01555     *ptrptr = ptr;
01556     return TRUE;
01557 
01558     /* Handle single-character metacharacters. In multiline mode, ^ disables
01559     the setting of any following char as a first character. */
01560 
01561     case '^':
01562     if ((options & PCRE_MULTILINE) != 0)
01563       {
01564       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
01565       }
01566     previous = NULL;
01567     *code++ = OP_CIRC;
01568     break;
01569 
01570     case '$':
01571     previous = NULL;
01572     *code++ = OP_DOLL;
01573     break;
01574 
01575     /* There can never be a first char if '.' is first, whatever happens about
01576     repeats. The value of reqbyte doesn't change either. */
01577 
01578     case '.':
01579     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
01580     zerofirstbyte = firstbyte;
01581     zeroreqbyte = reqbyte;
01582     previous = code;
01583     *code++ = OP_ANY;
01584     break;
01585 
01586     /* Character classes. If the included characters are all < 255 in value, we
01587     build a 32-byte bitmap of the permitted characters, except in the special
01588     case where there is only one such character. For negated classes, we build
01589     the map as usual, then invert it at the end. However, we use a different
01590     opcode so that data characters > 255 can be handled correctly.
01591 
01592     If the class contains characters outside the 0-255 range, a different
01593     opcode is compiled. It may optionally have a bit map for characters < 256,
01594     but those above are are explicitly listed afterwards. A flag byte tells
01595     whether the bitmap is present, and whether this is a negated class or not.
01596     */
01597 
01598     case '[':
01599     previous = code;
01600 
01601     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
01602     they are encountered at the top level, so we'll do that too. */
01603 
01604     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
01605         check_posix_syntax(ptr, &tempptr, cd))
01606       {
01607       *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
01608       goto FAILED;
01609       }
01610 
01611     /* If the first character is '^', set the negation flag and skip it. */
01612 
01613     if ((c = *(++ptr)) == '^')
01614       {
01615       negate_class = TRUE;
01616       c = *(++ptr);
01617       }
01618     else
01619       {
01620       negate_class = FALSE;
01621       }
01622 
01623     /* Keep a count of chars with values < 256 so that we can optimize the case
01624     of just a single character (as long as it's < 256). For higher valued UTF-8
01625     characters, we don't yet do any optimization. */
01626 
01627     class_charcount = 0;
01628     class_lastchar = -1;
01629 
01630 #ifdef SUPPORT_UTF8
01631     class_utf8 = FALSE;                       /* No chars >= 256 */
01632     class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */
01633 #endif
01634 
01635     /* Initialize the 32-char bit map to all zeros. We have to build the
01636     map in a temporary bit of store, in case the class contains only 1
01637     character (< 256), because in that case the compiled code doesn't use the
01638     bit map. */
01639 
01640     memset(class, 0, 32 * sizeof(uschar));
01641 
01642     /* Process characters until ] is reached. By writing this as a "do" it
01643     means that an initial ] is taken as a data character. The first pass
01644     through the regex checked the overall syntax, so we don't need to be very
01645     strict here. At the start of the loop, c contains the first byte of the
01646     character. */
01647 
01648     do
01649       {
01650 #ifdef SUPPORT_UTF8
01651       if (utf8 && c > 127)
01652         {                           /* Braces are required because the */
01653         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
01654         }
01655 #endif
01656 
01657       /* Inside \Q...\E everything is literal except \E */
01658 
01659       if (inescq)
01660         {
01661         if (c == '\\' && ptr[1] == 'E')
01662           {
01663           inescq = FALSE;
01664           ptr++;
01665           continue;
01666           }
01667         else goto LONE_SINGLE_CHARACTER;
01668         }
01669 
01670       /* Handle POSIX class names. Perl allows a negation extension of the
01671       form [:^name:]. A square bracket that doesn't match the syntax is
01672       treated as a literal. We also recognize the POSIX constructions
01673       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
01674       5.6 and 5.8 do. */
01675 
01676       if (c == '[' &&
01677           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
01678           check_posix_syntax(ptr, &tempptr, cd))
01679         {
01680         BOOL local_negate = FALSE;
01681         int posix_class, i;
01682         register const uschar *cbits = cd->cbits;
01683 
01684         if (ptr[1] != ':')
01685           {
01686           *errorptr = ERR31;
01687           goto FAILED;
01688           }
01689 
01690         ptr += 2;
01691         if (*ptr == '^')
01692           {
01693           local_negate = TRUE;
01694           ptr++;
01695           }
01696 
01697         posix_class = check_posix_name(ptr, tempptr - ptr);
01698         if (posix_class < 0)
01699           {
01700           *errorptr = ERR30;
01701           goto FAILED;
01702           }
01703 
01704         /* If matching is caseless, upper and lower are converted to
01705         alpha. This relies on the fact that the class table starts with
01706         alpha, lower, upper as the first 3 entries. */
01707 
01708         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
01709           posix_class = 0;
01710 
01711         /* Or into the map we are building up to 3 of the static class
01712         tables, or their negations. The [:blank:] class sets up the same
01713         chars as the [:space:] class (all white space). We remove the vertical
01714         white space chars afterwards. */
01715 
01716         posix_class *= 3;
01717         for (i = 0; i < 3; i++)
01718         {
01719             BOOL lisblank = strncmp((char *)ptr, "blank", 5) == 0;
01720             int taboffset = posix_class_maps[posix_class + i];
01721 
01722             if (taboffset < 0)
01723                 break;
01724 
01725             if (local_negate)
01726             {
01727                 for (c = 0; c < 32; c++)
01728                     class[c] |= ~cbits[c+taboffset];
01729 
01730                 if (lisblank)
01731                     class[1] |= 0x3c;
01732             }
01733             else
01734             {
01735                 for (c = 0; c < 32; c++)
01736                     class[c] |= cbits[c+taboffset];
01737 
01738                 if (lisblank)
01739                     class[1] &= ~0x3c;
01740             }
01741         }
01742         
01743         ptr = tempptr + 1;
01744         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
01745         continue;    /* End of POSIX syntax handling */
01746         }
01747 
01748       /* Backslash may introduce a single character, or it may introduce one
01749       of the specials, which just set a flag. Escaped items are checked for
01750       validity in the pre-compiling pass. The sequence \b is a special case.
01751       Inside a class (and only there) it is treated as backspace. Elsewhere
01752       it marks a word boundary. Other escapes have preset maps ready to
01753       or into the one we are building. We assume they have more than one
01754       character in them, so set class_charcount bigger than one. */
01755 
01756       if (c == '\\')
01757         {
01758         c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
01759         if (-c == ESC_b) c = '\b';  /* \b is backslash in a class */
01760 
01761         if (-c == ESC_Q)            /* Handle start of quoted string */
01762           {
01763           if (ptr[1] == '\\' && ptr[2] == 'E')
01764             {
01765             ptr += 2; /* avoid empty string */
01766             }
01767           else inescq = TRUE;
01768           continue;
01769           }
01770 
01771         else if (c < 0)
01772           {
01773           register const uschar *cbits = cd->cbits;
01774           class_charcount = 10;     /* Greater than 1 is what matters */
01775           switch (-c)
01776             {
01777             case ESC_d:
01778             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
01779             continue;
01780 
01781             case ESC_D:
01782             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
01783             continue;
01784 
01785             case ESC_w:
01786             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
01787             continue;
01788 
01789             case ESC_W:
01790             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
01791             continue;
01792 
01793             case ESC_s:
01794             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
01795             class[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
01796             continue;
01797 
01798             case ESC_S:
01799             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
01800             class[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
01801             continue;
01802 
01803             /* Unrecognized escapes are faulted if PCRE is running in its
01804             strict mode. By default, for compatibility with Perl, they are
01805             treated as literals. */
01806 
01807             default:
01808             if ((options & PCRE_EXTRA) != 0)
01809               {
01810               *errorptr = ERR7;
01811               goto FAILED;
01812               }
01813             c = *ptr;    /* The final character */
01814             }
01815           }
01816 
01817         /* Fall through if we have a single character (c >= 0). This may be
01818         > 256 in UTF-8 mode. */
01819 
01820         }   /* End of backslash handling */
01821 
01822       /* A single character may be followed by '-' to form a range. However,
01823       Perl does not permit ']' to be the end of the range. A '-' character
01824       here is treated as a literal. */
01825 
01826       if (ptr[1] == '-' && ptr[2] != ']')
01827         {
01828         int d;
01829         ptr += 2;
01830 
01831 #ifdef SUPPORT_UTF8
01832         if (utf8)
01833           {                           /* Braces are required because the */
01834           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
01835           }
01836         else
01837 #endif
01838         d = *ptr;
01839 
01840         /* The second part of a range can be a single-character escape, but
01841         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
01842         in such circumstances. */
01843 
01844         if (d == '\\')
01845           {
01846           const uschar *oldptr = ptr;
01847           d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
01848 
01849           /* \b is backspace; any other special means the '-' was literal */
01850 
01851           if (d < 0)
01852             {
01853             if (d == -ESC_b) d = '\b'; else
01854               {
01855               ptr = oldptr - 2;
01856               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
01857               }
01858             }
01859           }
01860 
01861         /* Check that the two values are in the correct order */
01862 
01863         if (d < c)
01864           {
01865           *errorptr = ERR8;
01866           goto FAILED;
01867           }
01868 
01869         /* If d is greater than 255, we can't just use the bit map, so set up
01870         for the UTF-8 supporting class type. If we are not caseless, we can
01871         just set up a single range. If we are caseless, the characters < 256
01872         are handled with a bitmap, in order to get the case-insensitive
01873         handling. */
01874 
01875 #ifdef SUPPORT_UTF8
01876         if (d > 255)
01877           {
01878           class_utf8 = TRUE;
01879           *class_utf8data++ = XCL_RANGE;
01880           if ((options & PCRE_CASELESS) == 0)
01881             {
01882             class_utf8data += ord2utf8(c, class_utf8data);
01883             class_utf8data += ord2utf8(d, class_utf8data);
01884             continue;  /* Go get the next char in the class */
01885             }
01886           class_utf8data += ord2utf8(256, class_utf8data);
01887           class_utf8data += ord2utf8(d, class_utf8data);
01888           d = 255;
01889           /* Fall through */
01890           }
01891 #endif
01892         /* We use the bit map if the range is entirely < 256, or if part of it
01893         is < 256 and matching is caseless. */
01894 
01895         for (; c <= d; c++)
01896           {
01897           class[c/8] |= (1 << (c&7));
01898           if ((options & PCRE_CASELESS) != 0)
01899             {
01900             int uc = cd->fcc[c];           /* flip case */
01901             class[uc/8] |= (1 << (uc&7));
01902             }
01903           class_charcount++;                /* in case a one-char range */
01904           class_lastchar = c;
01905           }
01906 
01907         continue;   /* Go get the next char in the class */
01908         }
01909 
01910       /* Handle a lone single character - we can get here for a normal
01911       non-escape char, or after \ that introduces a single character. */
01912 
01913       LONE_SINGLE_CHARACTER:
01914 
01915       /* Handle a multibyte character */
01916 
01917 #ifdef SUPPORT_UTF8
01918       if (utf8 && c > 255)
01919         {
01920         class_utf8 = TRUE;
01921         *class_utf8data++ = XCL_SINGLE;
01922         class_utf8data += ord2utf8(c, class_utf8data);
01923         }
01924       else
01925 #endif
01926       /* Handle a single-byte character */
01927         {
01928         class [c/8] |= (1 << (c&7));
01929         if ((options & PCRE_CASELESS) != 0)
01930           {
01931           c = cd->fcc[c];   /* flip case */
01932           class[c/8] |= (1 << (c&7));
01933           }
01934         class_charcount++;
01935         class_lastchar = c;
01936         }
01937       }
01938 
01939     /* Loop until ']' reached; the check for end of string happens inside the
01940     loop. This "while" is the end of the "do" above. */
01941 
01942     while ((c = *(++ptr)) != ']' || inescq);
01943 
01944     /* If class_charcount is 1, we saw precisely one character with a value <
01945     256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
01946     the one character is < 128. In non-UTF-8 mode we can always optimize.
01947 
01948     The optimization throws away the bit map. We turn the item into a
01949     1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
01950     that OP_NOT does not support multibyte characters. In the positive case, it
01951     can cause firstbyte to be set. Otherwise, there can be no first char if
01952     this item is first, whatever repeat count may follow. In the case of
01953     reqbyte, save the previous value for reinstating. */
01954 
01955 #ifdef SUPPORT_UTF8
01956     if (class_charcount == 1 &&
01957           (!utf8 ||
01958           (!class_utf8 && class_lastchar < 128)))
01959 #else
01960     if (class_charcount == 1)
01961 #endif
01962       {
01963       zeroreqbyte = reqbyte;
01964       if (negate_class)
01965         {
01966         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
01967         zerofirstbyte = firstbyte;
01968         *code++ = OP_NOT;
01969         }
01970       else
01971         {
01972         if (firstbyte == REQ_UNSET)
01973           {
01974           zerofirstbyte = REQ_NONE;
01975           firstbyte = class_lastchar | req_caseopt;
01976           }
01977         else
01978           {
01979           zerofirstbyte = firstbyte;
01980           reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;
01981           }
01982         *code++ = OP_CHARS;
01983         *code++ = 1;
01984         }
01985       *code++ = class_lastchar;
01986       break;  /* End of class handling */
01987       }       /* End of 1-byte optimization */
01988 
01989     /* Otherwise, if this is the first thing in the branch, there can be no
01990     first char setting, whatever the repeat count. Any reqbyte setting must
01991     remain unchanged after any kind of repeat. */
01992 
01993     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
01994     zerofirstbyte = firstbyte;
01995     zeroreqbyte = reqbyte;
01996 
01997     /* If there are characters with values > 255, we have to compile an
01998     extended class, with its own opcode. If there are no characters < 256,
01999     we can omit the bitmap. */
02000 
02001 #ifdef SUPPORT_UTF8
02002     if (class_utf8)
02003       {
02004       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
02005       *code++ = OP_XCLASS;
02006       code += LINK_SIZE;
02007       *code = negate_class? XCL_NOT : 0;
02008 
02009       /* If the map is required, install it, and move on to the end of
02010       the extra data */
02011 
02012       if (class_charcount > 0)
02013         {
02014         *code++ |= XCL_MAP;
02015         memcpy(code, class, 32);
02016         code = class_utf8data;
02017         }
02018 
02019       /* If the map is not required, slide down the extra data. */
02020 
02021       else
02022         {
02023         int len = class_utf8data - (code + 33);
02024         memmove(code + 1, code + 33, len);
02025         code += len + 1;
02026         }
02027 
02028       /* Now fill in the complete length of the item */
02029 
02030       PUT(previous, 1, code - previous);
02031       break;   /* End of class handling */
02032       }
02033 #endif
02034 
02035     /* If there are no characters > 255, negate the 32-byte map if necessary,
02036     and copy it into the code vector. If this is the first thing in the branch,
02037     there can be no first char setting, whatever the repeat count. Any reqbyte
02038     setting must remain unchanged after any kind of repeat. */
02039 
02040     if (negate_class)
02041       {
02042       *code++ = OP_NCLASS;
02043       for (c = 0; c < 32; c++) code[c] = ~class[c];
02044       }
02045     else
02046       {
02047       *code++ = OP_CLASS;
02048       memcpy(code, class, 32);
02049       }
02050     code += 32;
02051     break;
02052 
02053     /* Various kinds of repeat */
02054 
02055     case '{':
02056     if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
02057     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
02058     if (*errorptr != NULL) goto FAILED;
02059     goto REPEAT;
02060 
02061     case '*':
02062     repeat_min = 0;
02063     repeat_max = -1;
02064     goto REPEAT;
02065 
02066     case '+':
02067     repeat_min = 1;
02068     repeat_max = -1;
02069     goto REPEAT;
02070 
02071     case '?':
02072     repeat_min = 0;
02073     repeat_max = 1;
02074 
02075     REPEAT:
02076     if (previous == NULL)
02077       {
02078       *errorptr = ERR9;
02079       goto FAILED;
02080       }
02081 
02082     if (repeat_min == 0)
02083       {
02084       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
02085       reqbyte = zeroreqbyte;        /* Ditto */
02086       }
02087 
02088     /* Remember whether this is a variable length repeat */
02089 
02090     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
02091 
02092     op_type = 0;                    /* Default single-char op codes */
02093     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
02094 
02095     /* Save start of previous item, in case we have to move it up to make space
02096     for an inserted OP_ONCE for the additional '+' extension. */
02097 
02098     tempcode = previous;
02099 
02100     /* If the next character is '+', we have a possessive quantifier. This
02101     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
02102     If the next character is '?' this is a minimizing repeat, by default,
02103     but if PCRE_UNGREEDY is set, it works the other way round. We change the
02104     repeat type to the non-default. */
02105 
02106     if (ptr[1] == '+')
02107       {
02108       repeat_type = 0;                  /* Force greedy */
02109       possessive_quantifier = TRUE;
02110       ptr++;
02111       }
02112     else if (ptr[1] == '?')
02113       {
02114       repeat_type = greedy_non_default;
02115       ptr++;
02116       }
02117     else repeat_type = greedy_default;
02118 
02119     /* If previous was a recursion, we need to wrap it inside brackets so that
02120     it can be replicated if necessary. */
02121 
02122     if (*previous == OP_RECURSE)
02123       {
02124       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
02125       code += 1 + LINK_SIZE;
02126       *previous = OP_BRA;
02127       PUT(previous, 1, code - previous);
02128       *code = OP_KET;
02129       PUT(code, 1, code - previous);
02130       code += 1 + LINK_SIZE;
02131       }
02132 
02133     /* If previous was a string of characters, chop off the last one and use it
02134     as the subject of the repeat. If there was only one character, we can
02135     abolish the previous item altogether. If a one-char item has a minimum of
02136     more than one, ensure that it is set in reqbyte - it might not be if a
02137     sequence such as x{3} is the first thing in a branch because the x will
02138     have gone into firstbyte instead.  */
02139 
02140     if (*previous == OP_CHARS)
02141       {
02142       /* Deal with UTF-8 characters that take up more than one byte. It's
02143       easier to write this out separately than try to macrify it. Use c to
02144       hold the length of the character in bytes, plus 0x80 to flag that it's a
02145       length rather than a small character. */
02146 
02147 #ifdef SUPPORT_UTF8
02148       if (utf8 && (code[-1] & 0x80) != 0)
02149         {
02150         uschar *lastchar = code - 1;
02151         while((*lastchar & 0xc0) == 0x80) lastchar--;
02152         c = code - lastchar;            /* Length of UTF-8 character */
02153         memcpy(utf8_char, lastchar, c); /* Save the char */
02154         if (lastchar == previous + 2)   /* There was only one character */
02155           {
02156           code = previous;              /* Abolish the previous item */
02157           }
02158         else
02159           {
02160           previous[1] -= c;             /* Adjust length of previous */
02161           code = lastchar;              /* Lost char off the end */
02162           tempcode = code;              /* Adjust position to be moved for '+' */
02163           }
02164         c |= 0x80;                      /* Flag c as a length */
02165         }
02166       else
02167 #endif
02168 
02169       /* Handle the case of a single byte - either with no UTF8 support, or
02170       with UTF-8 disabled, or for a UTF-8 character < 128. */
02171 
02172         {
02173         c = *(--code);
02174         if (code == previous + 2)   /* There was only one character */
02175           {
02176           code = previous;              /* Abolish the previous item */
02177           if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
02178           }
02179         else
02180           {
02181           previous[1]--;             /* adjust length */
02182           tempcode = code;           /* Adjust position to be moved for '+' */
02183           }
02184         }
02185 
02186       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
02187       }
02188 
02189     /* If previous was a single negated character ([^a] or similar), we use
02190     one of the special opcodes, replacing it. The code is shared with single-
02191     character repeats by setting op_type to add a suitable offset into
02192     repeat_type. OP_NOT is currently used only for single-byte chars. */
02193 
02194     else if (*previous == OP_NOT)
02195       {
02196       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
02197       c = previous[1];
02198       code = previous;
02199       goto OUTPUT_SINGLE_REPEAT;
02200       }
02201 
02202     /* If previous was a character type match (\d or similar), abolish it and
02203     create a suitable repeat item. The code is shared with single-character
02204     repeats by setting op_type to add a suitable offset into repeat_type. */
02205 
02206     else if (*previous < OP_EODN)
02207       {
02208       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
02209       c = *previous;
02210       code = previous;
02211 
02212       OUTPUT_SINGLE_REPEAT:
02213 
02214       /* If the maximum is zero then the minimum must also be zero; Perl allows
02215       this case, so we do too - by simply omitting the item altogether. */
02216 
02217       if (repeat_max == 0) goto END_REPEAT;
02218 
02219       /* Combine the op_type with the repeat_type */
02220 
02221       repeat_type += op_type;
02222 
02223       /* A minimum of zero is handled either as the special case * or ?, or as
02224       an UPTO, with the maximum given. */
02225 
02226       if (repeat_min == 0)
02227         {
02228         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
02229           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
02230         else
02231           {
02232           *code++ = OP_UPTO + repeat_type;
02233           PUT2INC(code, 0, repeat_max);
02234           }
02235         }
02236 
02237       /* The case {1,} is handled as the special case + */
02238 
02239       else if (repeat_min == 1 && repeat_max == -1)
02240         *code++ = OP_PLUS + repeat_type;
02241 
02242       /* The case {n,n} is just an EXACT, while the general case {n,m} is
02243       handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
02244 
02245       else
02246         {
02247         if (repeat_min != 1)
02248           {
02249           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
02250           PUT2INC(code, 0, repeat_min);
02251           }
02252 
02253         /* If the minimum is 1 and the previous item was a character string,
02254         we either have to put back the item that got cancelled if the string
02255         length was 1, or add the character back onto the end of a longer
02256         string. For a character type nothing need be done; it will just get
02257         put back naturally. Note that the final character is always going to
02258         get added below, so we leave code ready for its insertion. */
02259 
02260         else if (*previous == OP_CHARS)
02261           {
02262           if (code == previous) code += 2; else
02263 
02264           /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
02265           bit set as a flag. The length will always be between 2 and 6. */
02266 
02267 #ifdef SUPPORT_UTF8
02268           if (utf8 && c >= 128) previous[1] += c & 7; else
02269 #endif
02270           previous[1]++;
02271           }
02272 
02273         /*  For a single negated character we also have to put back the
02274         item that got cancelled. At present this applies only to single byte
02275         characters in any mode. */
02276 
02277         else if (*previous == OP_NOT) code++;
02278 
02279         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
02280         we have to insert the character for the previous code. In UTF-8 mode,
02281         long characters have their length in c, with the 0x80 bit as a flag. */
02282 
02283         if (repeat_max < 0)
02284           {
02285 #ifdef SUPPORT_UTF8
02286           if (utf8 && c >= 128)
02287             {
02288             memcpy(code, utf8_char, c & 7);
02289             code += c & 7;
02290             }
02291           else
02292 #endif
02293           *code++ = c;
02294           *code++ = OP_STAR + repeat_type;
02295           }
02296 
02297         /* Else insert an UPTO if the max is greater than the min, again
02298         preceded by the character, for the previously inserted code. */
02299 
02300         else if (repeat_max != repeat_min)
02301           {
02302 #ifdef SUPPORT_UTF8
02303           if (utf8 && c >= 128)
02304             {
02305             memcpy(code, utf8_char, c & 7);
02306             code += c & 7;
02307             }
02308           else
02309 #endif
02310           *code++ = c;
02311           repeat_max -= repeat_min;
02312           *code++ = OP_UPTO + repeat_type;
02313           PUT2INC(code, 0, repeat_max);
02314           }
02315         }
02316 
02317       /* The character or character type itself comes last in all cases. */
02318 
02319 #ifdef SUPPORT_UTF8
02320       if (utf8 && c >= 128)
02321         {
02322         memcpy(code, utf8_char, c & 7);
02323         code += c & 7;
02324         }
02325       else
02326 #endif
02327 
02328       *code++ = c;
02329       }
02330 
02331     /* If previous was a character class or a back reference, we put the repeat
02332     stuff after it, but just skip the item if the repeat was {0,0}. */
02333 
02334     else if (*previous == OP_CLASS ||
02335              *previous == OP_NCLASS ||
02336 #ifdef SUPPORT_UTF8
02337              *previous == OP_XCLASS ||
02338 #endif
02339              *previous == OP_REF)
02340       {
02341       if (repeat_max == 0)
02342         {
02343         code = previous;
02344         goto END_REPEAT;
02345         }
02346       if (repeat_min == 0 && repeat_max == -1)
02347         *code++ = OP_CRSTAR + repeat_type;
02348       else if (repeat_min == 1 && repeat_max == -1)
02349         *code++ = OP_CRPLUS + repeat_type;
02350       else if (repeat_min == 0 && repeat_max == 1)
02351         *code++ = OP_CRQUERY + repeat_type;
02352       else
02353         {
02354         *code++ = OP_CRRANGE + repeat_type;
02355         PUT2INC(code, 0, repeat_min);
02356         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
02357         PUT2INC(code, 0, repeat_max);
02358         }
02359       }
02360 
02361     /* If previous was a bracket group, we may have to replicate it in certain
02362     cases. */
02363 
02364     else if (*previous >= OP_BRA || *previous == OP_ONCE ||
02365              *previous == OP_COND)
02366       {
02367       register int i;
02368       int ketoffset = 0;
02369       int len = code - previous;
02370       uschar *bralink = NULL;
02371 
02372       /* If the maximum repeat count is unlimited, find the end of the bracket
02373       by scanning through from the start, and compute the offset back to it
02374       from the current code pointer. There may be an OP_OPT setting following
02375       the final KET, so we can't find the end just by going back from the code
02376       pointer. */
02377 
02378       if (repeat_max == -1)
02379         {
02380         register uschar *ket = previous;
02381         do ket += GET(ket, 1); while (*ket != OP_KET);
02382         ketoffset = code - ket;
02383         }
02384 
02385       /* The case of a zero minimum is special because of the need to stick
02386       OP_BRAZERO in front of it, and because the group appears once in the
02387       data, whereas in other cases it appears the minimum number of times. For
02388       this reason, it is simplest to treat this case separately, as otherwise
02389       the code gets far too messy. There are several special subcases when the
02390       minimum is zero. */
02391 
02392       if (repeat_min == 0)
02393         {
02394         /* If the maximum is also zero, we just omit the group from the output
02395         altogether. */
02396 
02397         if (repeat_max == 0)
02398           {
02399           code = previous;
02400           goto END_REPEAT;
02401           }
02402 
02403         /* If the maximum is 1 or unlimited, we just have to stick in the
02404         BRAZERO and do no more at this point. */
02405 
02406         if (repeat_max <= 1)
02407           {
02408           memmove(previous+1, previous, len);
02409           code++;
02410           *previous++ = OP_BRAZERO + repeat_type;
02411           }
02412 
02413         /* If the maximum is greater than 1 and limited, we have to replicate
02414         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
02415         The first one has to be handled carefully because it's the original
02416         copy, which has to be moved up. The remainder can be handled by code
02417         that is common with the non-zero minimum case below. We just have to
02418         adjust the value of repeat_max, since one less copy is required. */
02419 
02420         else
02421           {
02422           int offset;
02423           memmove(previous + 2 + LINK_SIZE, previous, len);
02424           code += 2 + LINK_SIZE;
02425           *previous++ = OP_BRAZERO + repeat_type;
02426           *previous++ = OP_BRA;
02427 
02428           /* We chain together the bracket offset fields that have to be
02429           filled in later when the ends of the brackets are reached. */
02430 
02431           offset = (bralink == NULL)? 0 : previous - bralink;
02432           bralink = previous;
02433           PUTINC(previous, 0, offset);
02434           }
02435 
02436         repeat_max--;
02437         }
02438 
02439       /* If the minimum is greater than zero, replicate the group as many
02440       times as necessary, and adjust the maximum to the number of subsequent
02441       copies that we need. If we set a first char from the group, and didn't
02442       set a required char, copy the latter from the former. */
02443 
02444       else
02445         {
02446         if (repeat_min > 1)
02447           {
02448           if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
02449           for (i = 1; i < repeat_min; i++)
02450             {
02451             memcpy(code, previous, len);
02452             code += len;
02453             }
02454           }
02455         if (repeat_max > 0) repeat_max -= repeat_min;
02456         }
02457 
02458       /* This code is common to both the zero and non-zero minimum cases. If
02459       the maximum is limited, it replicates the group in a nested fashion,
02460       remembering the bracket starts on a stack. In the case of a zero minimum,
02461       the first one was set up above. In all cases the repeat_max now specifies
02462       the number of additional copies needed. */
02463 
02464       if (repeat_max >= 0)
02465         {
02466         for (i = repeat_max - 1; i >= 0; i--)
02467           {
02468           *code++ = OP_BRAZERO + repeat_type;
02469 
02470           /* All but the final copy start a new nesting, maintaining the
02471           chain of brackets outstanding. */
02472 
02473           if (i != 0)
02474             {
02475             int offset;
02476             *code++ = OP_BRA;
02477             offset = (bralink == NULL)? 0 : code - bralink;
02478             bralink = code;
02479             PUTINC(code, 0, offset);
02480             }
02481 
02482           memcpy(code, previous, len);
02483           code += len;
02484           }
02485 
02486         /* Now chain through the pending brackets, and fill in their length
02487         fields (which are holding the chain links pro tem). */
02488 
02489         while (bralink != NULL)
02490           {
02491           int oldlinkoffset;
02492           int offset = code - bralink + 1;
02493           uschar *bra = code - offset;
02494           oldlinkoffset = GET(bra, 1);
02495           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
02496           *code++ = OP_KET;
02497           PUTINC(code, 0, offset);
02498           PUT(bra, 1, offset);
02499           }
02500         }
02501 
02502       /* If the maximum is unlimited, set a repeater in the final copy. We
02503       can't just offset backwards from the current code point, because we
02504       don't know if there's been an options resetting after the ket. The
02505       correct offset was computed above. */
02506 
02507       else code[-ketoffset] = OP_KETRMAX + repeat_type;
02508       }
02509 
02510     /* Else there's some kind of shambles */
02511 
02512     else
02513       {
02514       *errorptr = ERR11;
02515       goto FAILED;
02516       }
02517 
02518     /* If the character following a repeat is '+', we wrap the entire repeated
02519     item inside OP_ONCE brackets. This is just syntactic sugar, taken from
02520     Sun's Java package. The repeated item starts at tempcode, not at previous,
02521     which might be the first part of a string whose (former) last char we
02522     repeated. However, we don't support '+' after a greediness '?'. */
02523 
02524     if (possessive_quantifier)
02525       {
02526       int len = code - tempcode;
02527       memmove(tempcode + 1+LINK_SIZE, tempcode, len);
02528       code += 1 + LINK_SIZE;
02529       len += 1 + LINK_SIZE;
02530       tempcode[0] = OP_ONCE;
02531       *code++ = OP_KET;
02532       PUTINC(code, 0, len);
02533       PUT(tempcode, 1, len);
02534       }
02535 
02536     /* In all cases we no longer have a previous item. We also set the
02537     "follows varying string" flag for subsequently encountered reqbytes if
02538     it isn't already set and we have just passed a varying length item. */
02539 
02540     END_REPEAT:
02541     previous = NULL;
02542     cd->req_varyopt |= reqvary;
02543     break;
02544 
02545 
02546     /* Start of nested bracket sub-expression, or comment or lookahead or
02547     lookbehind or option setting or condition. First deal with special things
02548     that can come after a bracket; all are introduced by ?, and the appearance
02549     of any of them means that this is not a referencing group. They were
02550     checked for validity in the first pass over the string, so we don't have to
02551     check for syntax errors here.  */
02552 
02553     case '(':
02554     newoptions = options;
02555     skipbytes = 0;
02556 
02557     if (*(++ptr) == '?')
02558       {
02559       int set, unset;
02560       int *optset;
02561 
02562       switch (*(++ptr))
02563         {
02564         case '#':                 /* Comment; skip to ket */
02565         ptr++;
02566         while (*ptr != ')') ptr++;
02567         continue;
02568 
02569         case ':':                 /* Non-extracting bracket */
02570         bravalue = OP_BRA;
02571         ptr++;
02572         break;
02573 
02574         case '(':
02575         bravalue = OP_COND;       /* Conditional group */
02576 
02577         /* Condition to test for recursion */
02578 
02579         if (ptr[1] == 'R')
02580           {
02581           code[1+LINK_SIZE] = OP_CREF;
02582           PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
02583           skipbytes = 3;
02584           ptr += 3;
02585           }
02586 
02587         /* Condition to test for a numbered subpattern match */
02588 
02589         else if ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
02590           {
02591           int condref;                 /* Don't amalgamate; some compilers */
02592           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */
02593           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
02594           if (condref == 0)
02595             {
02596             *errorptr = ERR35;
02597             goto FAILED;
02598             }
02599           ptr++;
02600           code[1+LINK_SIZE] = OP_CREF;
02601           PUT2(code, 2+LINK_SIZE, condref);
02602           skipbytes = 3;
02603           }
02604         /* For conditions that are assertions, we just fall through, having
02605         set bravalue above. */
02606         break;
02607 
02608         case '=':                 /* Positive lookahead */
02609         bravalue = OP_ASSERT;
02610         ptr++;
02611         break;
02612 
02613         case '!':                 /* Negative lookahead */
02614         bravalue = OP_ASSERT_NOT;
02615         ptr++;
02616         break;
02617 
02618         case '<':                 /* Lookbehinds */
02619         switch (*(++ptr))
02620           {
02621           case '=':               /* Positive lookbehind */
02622           bravalue = OP_ASSERTBACK;
02623           ptr++;
02624           break;
02625 
02626           case '!':               /* Negative lookbehind */
02627           bravalue = OP_ASSERTBACK_NOT;
02628           ptr++;
02629           break;
02630           }
02631         break;
02632 
02633         case '>':                 /* One-time brackets */
02634         bravalue = OP_ONCE;
02635         ptr++;
02636         break;
02637 
02638         case 'C':                 /* Callout - may be followed by digits */
02639         *code++ = OP_CALLOUT;
02640           {
02641           int n = 0;
02642           while ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
02643             n = n * 10 + *ptr - '0';
02644           if (n > 255)
02645             {
02646             *errorptr = ERR38;
02647             goto FAILED;
02648             }
02649           *code++ = n;
02650           }
02651         previous = NULL;
02652         continue;
02653 
02654         case 'P':                 /* Named subpattern handling */
02655         if (*(++ptr) == '<')      /* Definition */
02656           {
02657           int i, namelen;
02658           uschar *slot = cd->name_table;
02659           const uschar *name;     /* Don't amalgamate; some compilers */
02660           name = ++ptr;           /* grumble at autoincrement in declaration */
02661 
02662           while (*ptr++ != '>') ;
02663           namelen = ptr - name - 1;
02664 
02665           for (i = 0; i < cd->names_found; i++)
02666             {
02667             int crc = memcmp(name, slot+2, namelen);
02668             if (crc == 0)
02669               {
02670               if (slot[2+namelen] == 0)
02671                 {
02672                 *errorptr = ERR43;
02673                 goto FAILED;
02674                 }
02675               crc = -1;             /* Current name is substring */
02676               }
02677             if (crc < 0)
02678               {
02679               memmove(slot + cd->name_entry_size, slot,
02680                 (cd->names_found - i) * cd->name_entry_size);
02681               break;
02682               }
02683             slot += cd->name_entry_size;
02684             }
02685 
02686           PUT2(slot, 0, *brackets + 1);
02687           memcpy(slot + 2, name, namelen);
02688           slot[2+namelen] = 0;
02689           cd->names_found++;
02690           goto NUMBERED_GROUP;
02691           }
02692 
02693         if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */
02694           {
02695           int i, namelen;
02696           int type = *ptr++;
02697           const uschar *name = ptr;
02698           uschar *slot = cd->name_table;
02699 
02700           while (*ptr != ')') ptr++;
02701           namelen = ptr - name;
02702 
02703           for (i = 0; i < cd->names_found; i++)
02704             {
02705             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
02706             slot += cd->name_entry_size;
02707             }
02708           if (i >= cd->names_found)
02709             {
02710             *errorptr = ERR15;
02711             goto FAILED;
02712             }
02713 
02714           recno = GET2(slot, 0);
02715 
02716           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */
02717 
02718           /* Back reference */
02719 
02720           previous = code;
02721           *code++ = OP_REF;
02722           PUT2INC(code, 0, recno);
02723           cd->backref_map |= (recno < 32)? (1 << recno) : 1;
02724           if (recno > cd->top_backref) cd->top_backref = recno;
02725           continue;
02726           }
02727 
02728         /* Should never happen */
02729         break;
02730 
02731         case 'R':                 /* Pattern recursion */
02732         ptr++;                    /* Same as (?0)      */
02733         /* Fall through */
02734 
02735         /* Recursion or "subroutine" call */
02736 
02737         case '0': case '1': case '2': case '3': case '4':
02738         case '5': case '6': case '7': case '8': case '9':
02739           {
02740           const uschar *called;
02741           recno = 0;
02742 
02743           while ((cd->ctypes[*ptr] & ctype_digit) != 0)
02744             recno = recno * 10 + *ptr++ - '0';
02745 
02746           /* Come here from code above that handles a named recursion */
02747 
02748           HANDLE_RECURSION:
02749 
02750           previous = code;
02751 
02752           /* Find the bracket that is being referenced. Temporarily end the
02753           regex in case it doesn't exist. */
02754 
02755           *code = OP_END;
02756           called = (recno == 0)?
02757             cd->start_code : find_bracket(cd->start_code, utf8, recno);
02758 
02759           if (called == NULL)
02760             {
02761             *errorptr = ERR15;
02762             goto FAILED;
02763             }
02764 
02765           /* If the subpattern is still open, this is a recursive call. We
02766           check to see if this is a left recursion that could loop for ever,
02767           and diagnose that case. */
02768 
02769           if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
02770             {
02771             *errorptr = ERR40;
02772             goto FAILED;
02773             }
02774 
02775           /* Insert the recursion/subroutine item */
02776 
02777           *code = OP_RECURSE;
02778           PUT(code, 1, called - cd->start_code);
02779           code += 1 + LINK_SIZE;
02780           }
02781         continue;
02782 
02783         /* Character after (? not specially recognized */
02784 
02785         default:                  /* Option setting */
02786         set = unset = 0;
02787         optset = &set;
02788 
02789         while (*ptr != ')' && *ptr != ':')
02790           {
02791           switch (*ptr++)
02792             {
02793             case '-': optset = &unset; break;
02794 
02795             case 'i': *optset |= PCRE_CASELESS; break;
02796             case 'm': *optset |= PCRE_MULTILINE; break;
02797             case 's': *optset |= PCRE_DOTALL; break;
02798             case 'x': *optset |= PCRE_EXTENDED; break;
02799             case 'U': *optset |= PCRE_UNGREEDY; break;
02800             case 'X': *optset |= PCRE_EXTRA; break;
02801             }
02802           }
02803 
02804         /* Set up the changed option bits, but don't change anything yet. */
02805 
02806         newoptions = (options | set) & (~unset);
02807 
02808         /* If the options ended with ')' this is not the start of a nested
02809         group with option changes, so the options change at this level. Compile
02810         code to change the ims options if this setting actually changes any of
02811         them. We also pass the new setting back so that it can be put at the
02812         start of any following branches, and when this group ends (if we are in
02813         a group), a resetting item can be compiled.
02814 
02815         Note that if this item is right at the start of the pattern, the
02816         options will have been abstracted and made global, so there will be no
02817         change to compile. */
02818 
02819         if (*ptr == ')')
02820           {
02821           if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
02822             {
02823             *code++ = OP_OPT;
02824             *code++ = newoptions & PCRE_IMS;
02825             }
02826 
02827           /* Change options at this level, and pass them back for use
02828           in subsequent branches. Reset the greedy defaults and the case
02829           value for firstbyte and reqbyte. */
02830 
02831           *optionsptr = options = newoptions;
02832           greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
02833           greedy_non_default = greedy_default ^ 1;
02834           req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
02835 
02836           previous = NULL;       /* This item can't be repeated */
02837           continue;              /* It is complete */
02838           }
02839 
02840         /* If the options ended with ':' we are heading into a nested group
02841         with possible change of options. Such groups are non-capturing and are
02842         not assertions of any kind. All we need to do is skip over the ':';
02843         the newoptions value is handled below. */
02844 
02845         bravalue = OP_BRA;
02846         ptr++;
02847         }
02848       }
02849 
02850     /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
02851     non-capturing and behave like (?:...) brackets */
02852 
02853     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
02854       {
02855       bravalue = OP_BRA;
02856       }
02857 
02858     /* Else we have a referencing group; adjust the opcode. If the bracket
02859     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
02860     arrange for the true number to follow later, in an OP_BRANUMBER item. */
02861 
02862     else
02863       {
02864       NUMBERED_GROUP:
02865       if (++(*brackets) > EXTRACT_BASIC_MAX)
02866         {
02867         bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
02868         code[1+LINK_SIZE] = OP_BRANUMBER;
02869         PUT2(code, 2+LINK_SIZE, *brackets);
02870         skipbytes = 3;
02871         }
02872       else bravalue = OP_BRA + *brackets;
02873       }
02874 
02875     /* Process nested bracketed re. Assertions may not be repeated, but other
02876     kinds can be. We copy code into a non-register variable in order to be able
02877     to pass its address because some compilers complain otherwise. Pass in a
02878     new setting for the ims options if they have changed. */
02879 
02880     previous = (bravalue >= OP_ONCE)? code : NULL;
02881     *code = bravalue;
02882     tempcode = code;
02883     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
02884 
02885     if (!compile_regex(
02886          newoptions,                   /* The complete new option state */
02887          options & PCRE_IMS,           /* The previous ims option state */
02888          brackets,                     /* Extracting bracket count */
02889          &tempcode,                    /* Where to put code (updated) */
02890          &ptr,                         /* Input pointer (updated) */
02891          errorptr,                     /* Where to put an error message */
02892          (bravalue == OP_ASSERTBACK ||
02893           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
02894          skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
02895          &subfirstbyte,                /* For possible first char */
02896          &subreqbyte,                  /* For possible last char */
02897          bcptr,                        /* Current branch chain */
02898          cd))                          /* Tables block */
02899       goto FAILED;
02900 
02901     /* At the end of compiling, code is still pointing to the start of the
02902     group, while tempcode has been updated to point past the end of the group
02903     and any option resetting that may follow it. The pattern pointer (ptr)
02904     is on the bracket. */
02905 
02906     /* If this is a conditional bracket, check that there are no more than
02907     two branches in the group. */
02908 
02909     else if (bravalue == OP_COND)
02910       {
02911       uschar *tc = code;
02912       condcount = 0;
02913 
02914       do {
02915          condcount++;
02916          tc += GET(tc,1);
02917          }
02918       while (*tc != OP_KET);
02919 
02920       if (condcount > 2)
02921         {
02922         *errorptr = ERR27;
02923         goto FAILED;
02924         }
02925 
02926       /* If there is just one branch, we must not make use of its firstbyte or
02927       reqbyte, because this is equivalent to an empty second branch. */
02928 
02929       if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
02930       }
02931 
02932     /* Handle updating of the required and first characters. Update for normal
02933     brackets of all kinds, and conditions with two branches (see code above).
02934     If the bracket is followed by a quantifier with zero repeat, we have to
02935     back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
02936     main loop so that they can be accessed for the back off. */
02937 
02938     zeroreqbyte = reqbyte;
02939     zerofirstbyte = firstbyte;
02940     groupsetfirstbyte = FALSE;
02941 
02942     if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
02943       {
02944       /* If we have not yet set a firstbyte in this branch, take it from the
02945       subpattern, remembering that it was set here so that a repeat of more
02946       than one can replicate it as reqbyte if necessary. If the subpattern has
02947       no firstbyte, set "none" for the whole branch. In both cases, a zero
02948       repeat forces firstbyte to "none". */
02949 
02950       if (firstbyte == REQ_UNSET)
02951         {
02952         if (subfirstbyte >= 0)
02953           {
02954           firstbyte = subfirstbyte;
02955           groupsetfirstbyte = TRUE;
02956           }
02957         else firstbyte = REQ_NONE;
02958         zerofirstbyte = REQ_NONE;
02959         }
02960 
02961       /* If firstbyte was previously set, convert the subpattern's firstbyte
02962       into reqbyte if there wasn't one, using the vary flag that was in
02963       existence beforehand. */
02964 
02965       else if (subfirstbyte >= 0 && subreqbyte < 0)
02966         subreqbyte = subfirstbyte | tempreqvary;
02967 
02968       /* If the subpattern set a required byte (or set a first byte that isn't
02969       really the first byte - see above), set it. */
02970 
02971       if (subreqbyte >= 0) reqbyte = subreqbyte;
02972       }
02973 
02974     /* For a forward assertion, we take the reqbyte, if set. This can be
02975     helpful if the pattern that follows the assertion doesn't set a different
02976     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
02977     for an assertion, however because it leads to incorrect effect for patterns
02978     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
02979     of a firstbyte. This is overcome by a scan at the end if there's no
02980     firstbyte, looking for an asserted first char. */
02981 
02982     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
02983 
02984     /* Now update the main code pointer to the end of the group. */
02985 
02986     code = tempcode;
02987 
02988     /* Error if hit end of pattern */
02989 
02990     if (*ptr != ')')
02991       {
02992       *errorptr = ERR14;
02993       goto FAILED;
02994       }
02995     break;
02996 
02997     /* Check \ for being a real metacharacter; if not, fall through and handle
02998     it as a data character at the start of a string. Escape items are checked
02999     for validity in the pre-compiling pass. */
03000 
03001     case '\\':
03002     tempptr = ptr;
03003     c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
03004 
03005     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
03006     are arranged to be the negation of the corresponding OP_values. For the
03007     back references, the values are ESC_REF plus the reference number. Only
03008     back references and those types that consume a character may be repeated.
03009     We can test for values between ESC_b and ESC_Z for the latter; this may
03010     have to change if any new ones are ever created. */
03011 
03012     if (c < 0)
03013       {
03014       if (-c == ESC_Q)            /* Handle start of quoted string */
03015         {
03016         if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
03017           else inescq = TRUE;
03018         continue;
03019         }
03020 
03021       /* For metasequences that actually match a character, we disable the
03022       setting of a first character if it hasn't already been set. */
03023 
03024       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
03025         firstbyte = REQ_NONE;
03026 
03027       /* Set values to reset to if this is followed by a zero repeat. */
03028 
03029       zerofirstbyte = firstbyte;
03030       zeroreqbyte = reqbyte;
03031 
03032       /* Back references are handled specially */
03033 
03034       if (-c >= ESC_REF)
03035         {
03036         int number = -c - ESC_REF;
03037         previous = code;
03038         *code++ = OP_REF;
03039         PUT2INC(code, 0, number);
03040         }
03041       else
03042         {
03043         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
03044         *code++ = -c;
03045         }
03046       continue;
03047       }
03048 
03049     /* Data character: reset and fall through */
03050 
03051     ptr = tempptr;
03052     c = '\\';
03053 
03054     /* Handle a run of data characters until a metacharacter is encountered.
03055     The first character is guaranteed not to be whitespace or # when the
03056     extended flag is set. */
03057 
03058     NORMAL_CHAR:
03059     default:
03060     previous = code;
03061     *code = OP_CHARS;
03062     code += 2;
03063     length = 0;
03064 
03065     do
03066       {
03067       /* If in \Q...\E, check for the end; if not, we always have a literal */
03068 
03069       if (inescq)
03070         {
03071         if (c == '\\' && ptr[1] == 'E')
03072           {
03073           inescq = FALSE;
03074           ptr++;
03075           }
03076         else
03077           {
03078           *code++ = c;
03079           length++;
03080           }
03081         continue;
03082         }
03083 
03084       /* Skip white space and comments for /x patterns */
03085 
03086       if ((options & PCRE_EXTENDED) != 0)
03087         {
03088         if ((cd->ctypes[c] & ctype_space) != 0) continue;
03089         if (c == '#')
03090           {
03091           /* The space before the ; is to avoid a warning on a silly compiler
03092           on the Macintosh. */
03093           while ((c = *(++ptr)) != 0 && c != NEWLINE) ; /*** FIXME: test LF too? ***/
03094           if (c == 0) break;
03095           continue;
03096           }
03097         }
03098 
03099       /* Backslash may introduce a data char or a metacharacter. Escaped items
03100       are checked for validity in the pre-compiling pass. Stop the string
03101       before a metaitem. */
03102 
03103       if (c == '\\')
03104         {
03105         tempptr = ptr;
03106         c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
03107         if (c < 0) { ptr = tempptr; break; }
03108 
03109         /* If a character is > 127 in UTF-8 mode, we have to turn it into
03110         two or more characters in the UTF-8 encoding. */
03111 
03112 #ifdef SUPPORT_UTF8
03113         if (utf8 && c > 127)
03114           {
03115           uschar buffer[8];
03116           int len = ord2utf8(c, buffer);
03117           for (c = 0; c < len; c++) *code++ = buffer[c];
03118           length += len;
03119           continue;
03120           }
03121 #endif
03122         }
03123 
03124       /* Ordinary character or single-char escape */
03125 
03126       *code++ = c;
03127       length++;
03128       }
03129 
03130     /* This "while" is the end of the "do" above. */
03131 
03132     while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
03133 
03134     /* Update the first and last requirements. These are always bytes, even in
03135     UTF-8 mode. However, there is a special case to be considered when there
03136     are only one or two characters. Because this gets messy in UTF-8 mode, the
03137     code is kept separate. When we get here "length" contains the number of
03138     bytes. */
03139 
03140 #ifdef SUPPORT_UTF8
03141     if (utf8 && length > 1)
03142       {
03143       uschar *t = previous + 3;                      /* After this code, t */
03144       while (t < code && (*t & 0xc0) == 0x80) t++;   /* follows the 1st char */
03145 
03146       /* Handle the case when there is only one multibyte character. It must
03147       have at least two bytes because of the "length > 1" test above. */
03148 
03149       if (t == code)
03150         {
03151         /* If no previous first byte, set it from this character, but revert to
03152         none on a zero repeat. */
03153 
03154         if (firstbyte == REQ_UNSET)
03155           {
03156           zerofirstbyte = REQ_NONE;
03157           firstbyte = previous[2];
03158           }
03159 
03160         /* Otherwise, leave the first byte value alone, and don't change it on
03161         a zero repeat */
03162 
03163         else zerofirstbyte = firstbyte;
03164 
03165         /* In both cases, a zero repeat resets the previous required byte */
03166 
03167         zeroreqbyte = reqbyte;
03168         }
03169 
03170       /* Handle the case when there is more than one character. These may be
03171       single-byte or multibyte characters */
03172 
03173       else
03174         {
03175         t = code - 1;                       /* After this code, t is at the */
03176         while ((*t & 0xc0) == 0x80) t--;    /* start of the last character */
03177 
03178         /* If no previous first byte, set it from the first character, and
03179         retain it on a zero repeat (of the last character). The required byte
03180         is reset on a zero repeat, either to the byte before the last
03181         character, unless this is the first byte of the string. In that case,
03182         it reverts to its previous value. */
03183 
03184         if (firstbyte == REQ_UNSET)
03185           {
03186           zerofirstbyte = firstbyte = previous[2] | req_caseopt;
03187           zeroreqbyte = (t - 1 == previous + 2)?
03188             reqbyte : t[-1] | req_caseopt | cd->req_varyopt;
03189           }
03190 
03191         /* If there was a previous first byte, leave it alone, and don't change
03192         it on a zero repeat. The required byte is reset on a zero repeat to the
03193         byte before the last character. */
03194 
03195         else
03196           {
03197           zerofirstbyte = firstbyte;
03198           zeroreqbyte = t[-1] | req_caseopt | cd->req_varyopt;
03199           }
03200         }
03201 
03202       /* In all cases (we know length > 1), the new required byte is the last
03203       byte of the string. */
03204 
03205       reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
03206       }
03207 
03208     else   /* End of UTF-8 coding */
03209 #endif
03210 
03211     /* This is the code for non-UTF-8 operation, either without UTF-8 support,
03212     or when UTF-8 is not enabled. */
03213 
03214       {
03215       /* firstbyte was not previously set; take it from this string */
03216 
03217       if (firstbyte == REQ_UNSET)
03218         {
03219         if (length == 1)
03220           {
03221           zerofirstbyte = REQ_NONE;
03222           firstbyte = previous[2] | req_caseopt;
03223           zeroreqbyte = reqbyte;
03224           }
03225         else
03226           {
03227           zerofirstbyte = firstbyte = previous[2] | req_caseopt;
03228           zeroreqbyte = (length > 2)?
03229             (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;
03230           reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
03231           }
03232         }
03233 
03234       /* firstbyte was previously set */
03235 
03236       else
03237         {
03238         zerofirstbyte = firstbyte;
03239         zeroreqbyte = (length == 1)? reqbyte :
03240           code[-2] | req_caseopt | cd->req_varyopt;
03241         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
03242         }
03243       }
03244 
03245     /* Set the length in the data vector, and advance to the next state. */
03246 
03247     previous[1] = length;
03248     if (length < MAXLIT) ptr--;
03249     break;
03250     }
03251   }                   /* end of big loop */
03252 
03253 /* Control never reaches here by falling through, only by a goto for all the
03254 error states. Pass back the position in the pattern so that it can be displayed
03255 to the user for diagnosing the error. */
03256 
03257 FAILED:
03258 *ptrptr = ptr;
03259 return FALSE;
03260 }
03261 
03262 
03263 
03264 
03265 /*************************************************
03266 *     Compile sequence of alternatives           *
03267 *************************************************/
03268 
03269 /* On entry, ptr is pointing past the bracket character, but on return
03270 it points to the closing bracket, or vertical bar, or end of string.
03271 The code variable is pointing at the byte into which the BRA operator has been
03272 stored. If the ims options are changed at the start (for a (?ims: group) or
03273 during any branch, we need to insert an OP_OPT item at the start of every
03274 following branch to ensure they get set correctly at run time, and also pass
03275 the new options into every subsequent branch compile.
03276 
03277 Argument:
03278   options        option bits, including any changes for this subpattern
03279   oldims         previous settings of ims option bits
03280   brackets       -> int containing the number of extracting brackets used
03281   codeptr        -> the address of the current code pointer
03282   ptrptr         -> the address of the current pattern pointer
03283   errorptr       -> pointer to error message
03284   lookbehind     TRUE if this is a lookbehind assertion
03285   skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)
03286   firstbyteptr   place to put the first required character, or a negative number
03287   reqbyteptr     place to put the last required character, or a negative number
03288   bcptr          pointer to the chain of currently open branches
03289   cd             points to the data block with tables pointers etc.
03290 
03291 Returns:      TRUE on success
03292 */
03293 
03294 static BOOL
03295 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
03296   const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
03297   int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
03298 {
03299 const uschar *ptr = *ptrptr;
03300 uschar *code = *codeptr;
03301 uschar *last_branch = code;
03302 uschar *start_bracket = code;
03303 uschar *reverse_count = NULL;
03304 int firstbyte, reqbyte;
03305 int branchfirstbyte, branchreqbyte;
03306 branch_chain bc;
03307 
03308 bc.outer = bcptr;
03309 bc.current = code;
03310 
03311 firstbyte = reqbyte = REQ_UNSET;
03312 
03313 /* Offset is set zero to mark that this bracket is still open */
03314 
03315 PUT(code, 1, 0);
03316 code += 1 + LINK_SIZE + skipbytes;
03317 
03318 /* Loop for each alternative branch */
03319 
03320 for (;;)
03321   {
03322   /* Handle a change of ims options at the start of the branch */
03323 
03324   if ((options & PCRE_IMS) != oldims)
03325     {
03326     *code++ = OP_OPT;
03327     *code++ = options & PCRE_IMS;
03328     }
03329 
03330   /* Set up dummy OP_REVERSE if lookbehind assertion */
03331 
03332   if (lookbehind)
03333     {
03334     *code++ = OP_REVERSE;
03335     reverse_count = code;
03336     PUTINC(code, 0, 0);
03337     }
03338 
03339   /* Now compile the branch */
03340 
03341   if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
03342         &branchfirstbyte, &branchreqbyte, &bc, cd))
03343     {
03344     *ptrptr = ptr;
03345     return FALSE;
03346     }
03347 
03348   /* If this is the first branch, the firstbyte and reqbyte values for the
03349   branch become the values for the regex. */
03350 
03351   if (*last_branch != OP_ALT)
03352     {
03353     firstbyte = branchfirstbyte;
03354     reqbyte = branchreqbyte;
03355     }
03356 
03357   /* If this is not the first branch, the first char and reqbyte have to
03358   match the values from all the previous branches, except that if the previous
03359   value for reqbyte didn't have REQ_VARY set, it can still match, and we set
03360   REQ_VARY for the regex. */
03361 
03362   else
03363     {
03364     /* If we previously had a firstbyte, but it doesn't match the new branch,
03365     we have to abandon the firstbyte for the regex, but if there was previously
03366     no reqbyte, it takes on the value of the old firstbyte. */
03367 
03368     if (firstbyte >= 0 && firstbyte != branchfirstbyte)
03369       {
03370       if (reqbyte < 0) reqbyte = firstbyte;
03371       firstbyte = REQ_NONE;
03372       }
03373 
03374     /* If we (now or from before) have no firstbyte, a firstbyte from the
03375     branch becomes a reqbyte if there isn't a branch reqbyte. */
03376 
03377     if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
03378         branchreqbyte = branchfirstbyte;
03379 
03380     /* Now ensure that the reqbytes match */
03381 
03382     if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
03383       reqbyte = REQ_NONE;
03384     else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
03385     }
03386 
03387   /* If lookbehind, check that this branch matches a fixed-length string,
03388   and put the length into the OP_REVERSE item. Temporarily mark the end of
03389   the branch with OP_END. */
03390 
03391   if (lookbehind)
03392     {
03393     int length;
03394     *code = OP_END;
03395     length = find_fixedlength(last_branch, options);
03396     DPRINTF(("fixed length = %d\n", length));
03397     if (length < 0)
03398       {
03399       *errorptr = (length == -2)? ERR36 : ERR25;
03400       *ptrptr = ptr;
03401       return FALSE;
03402       }
03403     PUT(reverse_count, 0, length);
03404     }
03405 
03406   /* Reached end of expression, either ')' or end of pattern. Go back through
03407   the alternative branches and reverse the chain of offsets, with the field in
03408   the BRA item now becoming an offset to the first alternative. If there are
03409   no alternatives, it points to the end of the group. The length in the
03410   terminating ket is always the length of the whole bracketed item. If any of
03411   the ims options were changed inside the group, compile a resetting op-code
03412   following, except at the very end of the pattern. Return leaving the pointer
03413   at the terminating char. */
03414 
03415   if (*ptr != '|')
03416     {
03417     int length = code - last_branch;
03418     do
03419       {
03420       int prev_length = GET(last_branch, 1);
03421       PUT(last_branch, 1, length);
03422       length = prev_length;
03423       last_branch -= length;
03424       }
03425     while (length > 0);
03426 
03427     /* Fill in the ket */
03428 
03429     *code = OP_KET;
03430     PUT(code, 1, code - start_bracket);
03431     code += 1 + LINK_SIZE;
03432 
03433     /* Resetting option if needed */
03434 
03435     if ((options & PCRE_IMS) != oldims && *ptr == ')')
03436       {
03437       *code++ = OP_OPT;
03438       *code++ = oldims;
03439       }
03440 
03441     /* Set values to pass back */
03442 
03443     *codeptr = code;
03444     *ptrptr = ptr;
03445     *firstbyteptr = firstbyte;
03446     *reqbyteptr = reqbyte;
03447     return TRUE;
03448     }
03449 
03450   /* Another branch follows; insert an "or" node. Its length field points back
03451   to the previous branch while the bracket remains open. At the end the chain
03452   is reversed. It's done like this so that the start of the bracket has a
03453   zero offset until it is closed, making it possible to detect recursion. */
03454 
03455   *code = OP_ALT;
03456   PUT(code, 1, code - last_branch);
03457   bc.current = last_branch = code;
03458   code += 1 + LINK_SIZE;
03459   ptr++;
03460   }
03461 /* Control never reaches here */
03462 }
03463 
03464 
03465 
03466 
03467 /*************************************************
03468 *          Check for anchored expression         *
03469 *************************************************/
03470 
03471 /* Try to find out if this is an anchored regular expression. Consider each
03472 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
03473 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
03474 it's anchored. However, if this is a multiline pattern, then only OP_SOD
03475 counts, since OP_CIRC can match in the middle.
03476 
03477 We can also consider a regex to be anchored if OP_SOM starts all its branches.
03478 This is the code for \G, which means "match at start of match position, taking
03479 into account the match offset".
03480 
03481 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
03482 because that will try the rest of the pattern at all possible matching points,
03483 so there is no point trying again.... er ....
03484 
03485 .... except when the .* appears inside capturing parentheses, and there is a
03486 subsequent back reference to those parentheses. We haven't enough information
03487 to catch that case precisely.
03488 
03489 At first, the best we could do was to detect when .* was in capturing brackets
03490 and the highest back reference was greater than or equal to that level.
03491 However, by keeping a bitmap of the first 31 back references, we can catch some
03492 of the more common cases more precisely.
03493 
03494 Arguments:
03495   code           points to start of expression (the bracket)
03496   options        points to the options setting
03497   bracket_map    a bitmap of which brackets we are inside while testing; this
03498                   handles up to substring 31; after that we just have to take
03499                   the less precise approach
03500   backref_map    the back reference bitmap
03501 
03502 Returns:     TRUE or FALSE
03503 */
03504 
03505 static BOOL
03506 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
03507   unsigned int backref_map)
03508 {
03509 do {
03510    const uschar *scode =
03511      first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
03512    register int op = *scode;
03513 
03514    /* Capturing brackets */
03515 
03516    if (op > OP_BRA)
03517      {
03518      int new_map;
03519      op -= OP_BRA;
03520      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
03521      new_map = bracket_map | ((op < 32)? (1 << op) : 1);
03522      if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
03523      }
03524 
03525    /* Other brackets */
03526 
03527    else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
03528      {
03529      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
03530      }
03531 
03532    /* .* is not anchored unless DOTALL is set and it isn't in brackets that
03533    are or may be referenced. */
03534 
03535    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
03536             (*options & PCRE_DOTALL) != 0)
03537      {
03538      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
03539      }
03540 
03541    /* Check for explicit anchoring */
03542 
03543    else if (op != OP_SOD && op != OP_SOM &&
03544            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
03545      return FALSE;
03546    code += GET(code, 1);
03547    }
03548 while (*code == OP_ALT);   /* Loop for each alternative */
03549 return TRUE;
03550 }
03551 
03552 
03553 
03554 /*************************************************
03555 *         Check for starting with ^ or .*        *
03556 *************************************************/
03557 
03558 /* This is called to find out if every branch starts with ^ or .* so that
03559 "first char" processing can be done to speed things up in multiline
03560 matching and for non-DOTALL patterns that start with .* (which must start at
03561 the beginning or after \n). As in the case of is_anchored() (see above), we
03562 have to take account of back references to capturing brackets that contain .*
03563 because in that case we can't make the assumption.
03564 
03565 Arguments:
03566   code           points to start of expression (the bracket)
03567   bracket_map    a bitmap of which brackets we are inside while testing; this
03568                   handles up to substring 31; after that we just have to take
03569                   the less precise approach
03570   backref_map    the back reference bitmap
03571 
03572 Returns:         TRUE or FALSE
03573 */
03574 
03575 static BOOL
03576 is_startline(const uschar *code, unsigned int bracket_map,
03577   unsigned int backref_map)
03578 {
03579 do {
03580    const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
03581    register int op = *scode;
03582 
03583    /* Capturing brackets */
03584 
03585    if (op > OP_BRA)
03586      {
03587      int new_map;
03588      op -= OP_BRA;
03589      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
03590      new_map = bracket_map | ((op < 32)? (1 << op) : 1);
03591      if (!is_startline(scode, new_map, backref_map)) return FALSE;
03592      }
03593 
03594    /* Other brackets */
03595 
03596    else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
03597      { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
03598 
03599    /* .* is not anchored unless DOTALL is set and it isn't in brackets that
03600    may be referenced. */
03601 
03602    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
03603      {
03604      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
03605      }
03606 
03607    /* Check for explicit circumflex */
03608 
03609    else if (op != OP_CIRC) return FALSE;
03610    code += GET(code, 1);
03611    }
03612 while (*code == OP_ALT);  /* Loop for each alternative */
03613 return TRUE;
03614 }
03615 
03616 
03617 
03618 /*************************************************
03619 *       Check for asserted fixed first char      *
03620 *************************************************/
03621 
03622 /* During compilation, the "first char" settings from forward assertions are
03623 discarded, because they can cause conflicts with actual literals that follow.
03624 However, if we end up without a first char setting for an unanchored pattern,
03625 it is worth scanning the regex to see if there is an initial asserted first
03626 char. If all branches start with the same asserted char, or with a bracket all
03627 of whose alternatives start with the same asserted char (recurse ad lib), then
03628 we return that char, otherwise -1.
03629 
03630 Arguments:
03631   code       points to start of expression (the bracket)
03632   options    pointer to the options (used to check casing changes)
03633   inassert   TRUE if in an assertion
03634 
03635 Returns:     -1 or the fixed first char
03636 */
03637 
03638 static int
03639 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
03640 {
03641 register int c = -1;
03642 do {
03643    int d;
03644    const uschar *scode =
03645      first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
03646    register int op = *scode;
03647 
03648    if (op >= OP_BRA) op = OP_BRA;
03649 
03650    switch(op)
03651      {
03652      default:
03653      return -1;
03654 
03655      case OP_BRA:
03656      case OP_ASSERT:
03657      case OP_ONCE:
03658      case OP_COND:
03659      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
03660        return -1;
03661      if (c < 0) c = d; else if (c != d) return -1;
03662      break;
03663 
03664      case OP_EXACT:       /* Fall through */
03665      scode++;
03666 
03667      case OP_CHARS:       /* Fall through */
03668      scode++;
03669 
03670      case OP_PLUS:
03671      case OP_MINPLUS:
03672      if (!inassert) return -1;
03673      if (c < 0)
03674        {
03675        c = scode[1];
03676        if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
03677        }
03678      else if (c != scode[1]) return -1;
03679      break;
03680      }
03681 
03682    code += GET(code, 1);
03683    }
03684 while (*code == OP_ALT);
03685 return c;
03686 }
03687 
03688 
03689 
03690 
03691 /*************************************************
03692 *        Compile a Regular Expression            *
03693 *************************************************/
03694 
03695 /* This function takes a string and returns a pointer to a block of store
03696 holding a compiled version of the expression.
03697 
03698 Arguments:
03699   pattern      the regular expression
03700   options      various option bits
03701   errorptr     pointer to pointer to error text
03702   erroroffset  ptr offset in pattern where error was detected
03703   tables       pointer to character tables or NULL
03704 
03705 Returns:       pointer to compiled data block, or NULL on error,
03706                with errorptr and erroroffset set
03707 */
03708 
03709 pcre *
03710 pcre_compile(const char *pattern, int options, const char **errorptr,
03711   int *erroroffset, const unsigned char *tables)
03712 {
03713 real_pcre *re;
03714 int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
03715 int runlength;
03716 int c, firstbyte, reqbyte;
03717 int bracount = 0;
03718 int branch_extra = 0;
03719 int branch_newextra;
03720 int item_count = -1;
03721 int name_count = 0;
03722 int max_name_size = 0;
03723 #ifdef SUPPORT_UTF8
03724 int lastcharlength = 0;
03725 BOOL utf8;
03726 BOOL class_utf8;
03727 #endif
03728 BOOL inescq = FALSE;
03729 unsigned int brastackptr = 0;
03730 size_t size;
03731 uschar *code;
03732 const uschar *codestart;
03733 const uschar *ptr;
03734 compile_data compile_block;
03735 int brastack[BRASTACK_SIZE];
03736 uschar bralenstack[BRASTACK_SIZE];
03737 
03738 /* We can't pass back an error message if errorptr is NULL; I guess the best we
03739 can do is just return NULL. */
03740 
03741 if (errorptr == NULL) return NULL;
03742 *errorptr = NULL;
03743 
03744 /* However, we can give a message for this error */
03745 
03746 if (erroroffset == NULL)
03747   {
03748   *errorptr = ERR16;
03749   return NULL;
03750   }
03751 *erroroffset = 0;
03752 
03753 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
03754 
03755 #ifdef SUPPORT_UTF8
03756 utf8 = (options & PCRE_UTF8) != 0;
03757 #else
03758 if ((options & PCRE_UTF8) != 0)
03759   {
03760   *errorptr = ERR32;
03761   return NULL;
03762   }
03763 #endif
03764 
03765 if ((options & ~PUBLIC_OPTIONS) != 0)
03766   {
03767   *errorptr = ERR17;
03768   return NULL;
03769   }
03770 
03771 /* Set up pointers to the individual character tables */
03772 
03773 if (tables == NULL) tables = pcre_default_tables;
03774 compile_block.lcc = tables + lcc_offset;
03775 compile_block.fcc = tables + fcc_offset;
03776 compile_block.cbits = tables + cbits_offset;
03777 compile_block.ctypes = tables + ctypes_offset;
03778 
03779 /* Maximum back reference and backref bitmap. This is updated for numeric
03780 references during the first pass, but for named references during the actual
03781 compile pass. The bitmap records up to 31 back references to help in deciding
03782 whether (.*) can be treated as anchored or not. */
03783 
03784 compile_block.top_backref = 0;
03785 compile_block.backref_map = 0;
03786 
03787 /* Reflect pattern for debugging output */
03788 
03789 DPRINTF(("------------------------------------------------------------------\n"));
03790 DPRINTF(("%s\n", pattern));
03791 
03792 /* The first thing to do is to make a pass over the pattern to compute the
03793 amount of store required to hold the compiled code. This does not have to be
03794 perfect as long as errors are overestimates. At the same time we can detect any
03795 flag settings right at the start, and extract them. Make an attempt to correct
03796 for any counted white space if an "extended" flag setting appears late in the
03797 pattern. We can't be so clever for #-comments. */
03798 
03799 ptr = (const uschar *)(pattern - 1);
03800 while ((c = *(++ptr)) != 0)
03801   {
03802   int min, max;
03803   int class_optcount;
03804   int bracket_length;
03805   int duplength;
03806 
03807   /* If we are inside a \Q...\E sequence, all chars are literal */
03808 
03809   if (inescq) goto NORMAL_CHAR;
03810 
03811   /* Otherwise, first check for ignored whitespace and comments */
03812 
03813   if ((options & PCRE_EXTENDED) != 0)
03814     {
03815     if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
03816     if (c == '#')
03817       {
03818       /* The space before the ; is to avoid a warning on a silly compiler
03819       on the Macintosh. */
03820       while ((c = *(++ptr)) != 0 && c != NEWLINE) ; /*** FIXME: test LF too? ***/
03821       if (c == 0) break;
03822       continue;
03823       }
03824     }
03825 
03826   item_count++;    /* Is zero for the first non-comment item */
03827 
03828   switch(c)
03829     {
03830     /* A backslashed item may be an escaped "normal" character or a
03831     character type. For a "normal" character, put the pointers and
03832     character back so that tests for whitespace etc. in the input
03833     are done correctly. */
03834 
03835     case '\\':
03836       {
03837       const uschar *save_ptr = ptr;
03838       c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
03839       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
03840       if (c >= 0)
03841         {
03842         ptr = save_ptr;
03843         c = '\\';
03844         goto NORMAL_CHAR;
03845         }
03846       }
03847 
03848     /* If \Q, enter "literal" mode */
03849 
03850     if (-c == ESC_Q)
03851       {
03852       inescq = TRUE;
03853       continue;
03854       }
03855 
03856     /* Other escapes need one byte, and are of length one for repeats */
03857 
03858     length++;
03859 #ifdef SUPPORT_UTF8
03860     lastcharlength = 1;
03861 #endif
03862 
03863     /* A back reference needs an additional 2 bytes, plus either one or 5
03864     bytes for a repeat. We also need to keep the value of the highest
03865     back reference. */
03866 
03867     if (c <= -ESC_REF)
03868       {
03869       int refnum = -c - ESC_REF;
03870       compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
03871       if (refnum > compile_block.top_backref)
03872         compile_block.top_backref = refnum;
03873       length += 2;   /* For single back reference */
03874       if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
03875         {
03876         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
03877         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
03878         if ((min == 0 && (max == 1 || max == -1)) ||
03879           (min == 1 && max == -1))
03880             length++;
03881         else length += 5;
03882         if (ptr[1] == '?') ptr++;
03883         }
03884       }
03885     continue;
03886 
03887     case '^':     /* Single-byte metacharacters */
03888     case '.':
03889     case '$':
03890     length++;
03891 #ifdef SUPPORT_UTF8
03892     lastcharlength = 1;
03893 #endif
03894     continue;
03895 
03896     case '*':            /* These repeats won't be after brackets; */
03897     case '+':            /* those are handled separately */
03898     case '?':
03899     length++;
03900     goto POSESSIVE;      /* A few lines below */
03901 
03902     /* This covers the cases of braced repeats after a single char, metachar,
03903     class, or back reference. */
03904 
03905     case '{':
03906     if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
03907     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
03908     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
03909 
03910     /* These special cases just insert one extra opcode */
03911 
03912     if ((min == 0 && (max == 1 || max == -1)) ||
03913       (min == 1 && max == -1))
03914         length++;
03915 
03916     /* These cases might insert additional copies of a preceding character. */
03917 
03918     else
03919       {
03920 #ifdef SUPPORT_UTF8
03921       /* In UTF-8 mode, we should find the length in lastcharlength */
03922       if (utf8)
03923         {
03924         if (min != 1)
03925           {
03926           length -= lastcharlength;   /* Uncount the original char or metachar */
03927           if (min > 0) length += 3 + lastcharlength;
03928           }
03929         length += lastcharlength + ((max > 0)? 3 : 1);
03930         }
03931       else
03932 #endif
03933 
03934       /* Not UTF-8 mode: all characters are one byte */
03935         {
03936         if (min != 1)
03937           {
03938           length--;   /* Uncount the original char or metachar */
03939           if (min > 0) length += 4;
03940           }
03941 
03942         length += (max > 0)? 4 : 2;
03943         }
03944       }
03945 
03946     if (ptr[1] == '?') ptr++;      /* Needs no extra length */
03947 
03948     POSESSIVE:                     /* Test for possessive quantifier */
03949     if (ptr[1] == '+')
03950       {
03951       ptr++;
03952       length += 2 + 2*LINK_SIZE;   /* Allow for atomic brackets */
03953       }
03954     continue;
03955 
03956     /* An alternation contains an offset to the next branch or ket. If any ims
03957     options changed in the previous branch(es), and/or if we are in a
03958     lookbehind assertion, extra space will be needed at the start of the
03959     branch. This is handled by branch_extra. */
03960 
03961     case '|':
03962     length += 1 + LINK_SIZE + branch_extra;
03963     continue;
03964 
03965     /* A character class uses 33 characters provided that all the character
03966     values are less than 256. Otherwise, it uses a bit map for low valued
03967     characters, and individual items for others. Don't worry about character
03968     types that aren't allowed in classes - they'll get picked up during the
03969     compile. A character class that contains only one single-byte character
03970     uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
03971     where we can. (In UTF-8 mode we can do this only for chars < 128.) */
03972 
03973     case '[':
03974     class_optcount = 0;
03975 
03976 #ifdef SUPPORT_UTF8
03977     class_utf8 = FALSE;
03978 #endif
03979 
03980     if (*(++ptr) == '^') ptr++;
03981 
03982     /* Written as a "do" so that an initial ']' is taken as data */
03983 
03984     if (*ptr != 0) do
03985       {
03986       /* Inside \Q...\E everything is literal except \E */
03987 
03988       if (inescq)
03989         {
03990         if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;
03991         inescq = FALSE;
03992         ptr += 1;
03993         continue;
03994         }
03995 
03996       /* Outside \Q...\E, check for escapes */
03997 
03998       if (*ptr == '\\')
03999         {
04000 #ifdef SUPPORT_UTF8
04001         int prevchar = ptr[-1];
04002 #endif
04003         int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
04004           &compile_block);
04005         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04006 
04007         /* \b is backspace inside a class */
04008 
04009         if (-ch == ESC_b) ch = '\b';
04010 
04011         /* \Q enters quoting mode */
04012 
04013         if (-ch == ESC_Q)
04014           {
04015           inescq = TRUE;
04016           continue;
04017           }
04018 
04019         /* Handle escapes that turn into characters */
04020 
04021         if (ch >= 0)
04022           {
04023 #ifdef SUPPORT_UTF8
04024           if (utf8)
04025             {
04026             if (ch > 127) class_optcount = 10;  /* Ensure > 1 */
04027             if (ch > 255)
04028               {
04029               uschar buffer[6];
04030               if (!class_utf8)
04031                 {
04032                 class_utf8 = TRUE;
04033                 length += LINK_SIZE + 1 + 1;
04034                 }
04035               length += 1 + ord2utf8(ch, buffer);
04036 
04037               /* If this wide character is preceded by '-', add an extra 2 to
04038               the length in case the previous character was < 128, because in
04039               this case the whole range will be put into the list. */
04040 
04041               if (prevchar == '-') length += 2;
04042               }
04043             }
04044 #endif
04045           class_optcount++;            /* for possible optimization */
04046           }
04047         else class_optcount = 10;      /* \d, \s etc; make sure > 1 */
04048         }
04049 
04050       /* Check the syntax for POSIX stuff. The bits we actually handle are
04051       checked during the real compile phase. */
04052 
04053       else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
04054         {
04055         ptr++;
04056         class_optcount = 10;    /* Make sure > 1 */
04057         }
04058 
04059       /* Anything else just increments the possible optimization count. If
04060       there are wide characters, we are going to have to use an XCLASS. */
04061 
04062       else
04063         {
04064         NON_SPECIAL_CHARACTER:
04065         class_optcount++;
04066 
04067 #ifdef SUPPORT_UTF8
04068         if (utf8)
04069           {
04070           int ch;
04071           int extra = 0;
04072           GETCHARLEN(ch, ptr, extra);
04073           if (ch > 127) class_optcount = 10;   /* No optimization possible */
04074           if (ch > 255)
04075             {
04076             if (!class_utf8)
04077               {
04078               class_utf8 = TRUE;
04079               length += LINK_SIZE + 1 + 1;
04080               }
04081             length += 2 + extra;
04082 
04083             /* If this wide character is preceded by '-', add an extra 2 to
04084             the length in case the previous character was < 128, because in
04085             this case the whole range will be put into the list. */
04086 
04087             if (ptr[-1] == '-') length += 2;
04088 
04089             /* Advance to the end of this character */
04090 
04091             ptr += extra;
04092             }
04093           }
04094 #endif
04095         }
04096       }
04097     while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
04098 
04099     if (*ptr == 0)                          /* Missing terminating ']' */
04100       {
04101       *errorptr = ERR6;
04102       goto PCRE_ERROR_RETURN;
04103       }
04104 
04105     /* We can optimize when there was only one optimizable character. Repeats
04106     for positive and negated single one-byte chars are handled by the general
04107     code. Here, we handle repeats for the class opcodes. */
04108 
04109     if (class_optcount == 1) length += 3; else
04110       {
04111       length += 33;
04112 
04113       /* A repeat needs either 1 or 5 bytes. */
04114 
04115       if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
04116         {
04117         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
04118         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04119         if ((min == 0 && (max == 1 || max == -1)) ||
04120           (min == 1 && max == -1))
04121             length++;
04122         else length += 5;
04123         if (ptr[1] == '?') ptr++;
04124         }
04125       }
04126     continue;
04127 
04128     /* Brackets may be genuine groups or special things */
04129 
04130     case '(':
04131     branch_newextra = 0;
04132     bracket_length = 1 + LINK_SIZE;
04133 
04134     /* Handle special forms of bracket, which all start (? */
04135 
04136     if (ptr[1] == '?')
04137       {
04138       int set, unset;
04139       int *optset;
04140 
04141       switch (c = ptr[2])
04142         {
04143         /* Skip over comments entirely */
04144         case '#':
04145         ptr += 3;
04146         while (*ptr != 0 && *ptr != ')') ptr++;
04147         if (*ptr == 0)
04148           {
04149           *errorptr = ERR18;
04150           goto PCRE_ERROR_RETURN;
04151           }
04152         continue;
04153 
04154         /* Non-referencing groups and lookaheads just move the pointer on, and
04155         then behave like a non-special bracket, except that they don't increment
04156         the count of extracting brackets. Ditto for the "once only" bracket,
04157         which is in Perl from version 5.005. */
04158 
04159         case ':':
04160         case '=':
04161         case '!':
04162         case '>':
04163         ptr += 2;
04164         break;
04165 
04166         /* (?R) specifies a recursive call to the regex, which is an extension
04167         to provide the facility which can be obtained by (?p{perl-code}) in
04168         Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
04169 
04170         From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
04171         the appropriate numbered brackets. This includes both recursive and
04172         non-recursive calls. (?R) is now synonymous with (?0). */
04173 
04174         case 'R':
04175         ptr++;
04176 
04177         case '0': case '1': case '2': case '3': case '4':
04178         case '5': case '6': case '7': case '8': case '9':
04179         ptr += 2;
04180         if (c != 'R')
04181           while ((compile_block.ctypes[*(++ptr)] & ctype_digit) != 0) ;
04182         if (*ptr != ')')
04183           {
04184           *errorptr = ERR29;
04185           goto PCRE_ERROR_RETURN;
04186           }
04187         length += 1 + LINK_SIZE;
04188 
04189         /* If this item is quantified, it will get wrapped inside brackets so
04190         as to use the code for quantified brackets. We jump down and use the
04191         code that handles this for real brackets. */
04192 
04193         if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
04194           {
04195           length += 2 + 2 * LINK_SIZE;       /* to make bracketed */
04196           duplength = 5 + 3 * LINK_SIZE;
04197           goto HANDLE_QUANTIFIED_BRACKETS;
04198           }
04199         continue;
04200 
04201         /* (?C) is an extension which provides "callout" - to provide a bit of
04202         the functionality of the Perl (?{...}) feature. An optional number may
04203         follow (default is zero). */
04204 
04205         case 'C':
04206         ptr += 2;
04207         while ((compile_block.ctypes[*(++ptr)] & ctype_digit) != 0) ;
04208         if (*ptr != ')')
04209           {
04210           *errorptr = ERR39;
04211           goto PCRE_ERROR_RETURN;
04212           }
04213         length += 2;
04214         continue;
04215 
04216         /* Named subpatterns are an extension copied from Python */
04217 
04218         case 'P':
04219         ptr += 3;
04220         if (*ptr == '<')
04221           {
04222           const uschar *p;    /* Don't amalgamate; some compilers */
04223           p = ++ptr;          /* grumble at autoincrement in declaration */
04224           while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
04225           if (*ptr != '>')
04226             {
04227             *errorptr = ERR42;
04228             goto PCRE_ERROR_RETURN;
04229             }
04230           name_count++;
04231           if (ptr - p > max_name_size) max_name_size = (ptr - p);
04232           break;
04233           }
04234 
04235         if (*ptr == '=' || *ptr == '>')
04236           {
04237           while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0) ;
04238           if (*ptr != ')')
04239             {
04240             *errorptr = ERR42;
04241             goto PCRE_ERROR_RETURN;
04242             }
04243           break;
04244           }
04245 
04246         /* Unknown character after (?P */
04247 
04248         *errorptr = ERR41;
04249         goto PCRE_ERROR_RETURN;
04250 
04251         /* Lookbehinds are in Perl from version 5.005 */
04252 
04253         case '<':
04254         ptr += 3;
04255         if (*ptr == '=' || *ptr == '!')
04256           {
04257           branch_newextra = 1 + LINK_SIZE;
04258           length += 1 + LINK_SIZE;         /* For the first branch */
04259           break;
04260           }
04261         *errorptr = ERR24;
04262         goto PCRE_ERROR_RETURN;
04263 
04264         /* Conditionals are in Perl from version 5.005. The bracket must either
04265         be followed by a number (for bracket reference) or by an assertion
04266         group, or (a PCRE extension) by 'R' for a recursion test. */
04267 
04268         case '(':
04269         if (ptr[3] == 'R' && ptr[4] == ')')
04270           {
04271           ptr += 4;
04272           length += 3;
04273           }
04274         else if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
04275           {
04276           ptr += 4;
04277           length += 3;
04278           while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
04279           if (*ptr != ')')
04280             {
04281             *errorptr = ERR26;
04282             goto PCRE_ERROR_RETURN;
04283             }
04284           }
04285         else   /* An assertion must follow */
04286           {
04287           ptr++;   /* Can treat like ':' as far as spacing is concerned */
04288           if (ptr[2] != '?' ||
04289              (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
04290             {
04291             ptr += 2;    /* To get right offset in message */
04292             *errorptr = ERR28;
04293             goto PCRE_ERROR_RETURN;
04294             }
04295           }
04296         break;
04297 
04298         /* Else loop checking valid options until ) is met. Anything else is an
04299         error. If we are without any brackets, i.e. at top level, the settings
04300         act as if specified in the options, so massage the options immediately.
04301         This is for backward compatibility with Perl 5.004. */
04302 
04303         default:
04304         set = unset = 0;
04305         optset = &set;
04306         ptr += 2;
04307 
04308         for (;; ptr++)
04309           {
04310           c = *ptr;
04311           switch (c)
04312             {
04313             case 'i':
04314             *optset |= PCRE_CASELESS;
04315             continue;
04316 
04317             case 'm':
04318             *optset |= PCRE_MULTILINE;
04319             continue;
04320 
04321             case 's':
04322             *optset |= PCRE_DOTALL;
04323             continue;
04324 
04325             case 'x':
04326             *optset |= PCRE_EXTENDED;
04327             continue;
04328 
04329             case 'X':
04330             *optset |= PCRE_EXTRA;
04331             continue;
04332 
04333             case 'U':
04334             *optset |= PCRE_UNGREEDY;
04335             continue;
04336 
04337             case '-':
04338             optset = &unset;
04339             continue;
04340 
04341             /* A termination by ')' indicates an options-setting-only item; if
04342             this is at the very start of the pattern (indicated by item_count
04343             being zero), we use it to set the global options. This is helpful
04344             when analyzing the pattern for first characters, etc. Otherwise
04345             nothing is done here and it is handled during the compiling
04346             process.
04347 
04348             [Historical note: Up to Perl 5.8, options settings at top level
04349             were always global settings, wherever they appeared in the pattern.
04350             That is, they were equivalent to an external setting. From 5.8
04351             onwards, they apply only to what follows (which is what you might
04352             expect).] */
04353 
04354             case ')':
04355             if (item_count == 0)
04356               {
04357               options = (options | set) & (~unset);
04358               set = unset = 0;     /* To save length */
04359               item_count--;        /* To allow for several */
04360               }
04361 
04362             /* Fall through */
04363 
04364             /* A termination by ':' indicates the start of a nested group with
04365             the given options set. This is again handled at compile time, but
04366             we must allow for compiled space if any of the ims options are
04367             set. We also have to allow for resetting space at the end of
04368             the group, which is why 4 is added to the length and not just 2.
04369             If there are several changes of options within the same group, this
04370             will lead to an over-estimate on the length, but this shouldn't
04371             matter very much. We also have to allow for resetting options at
04372             the start of any alternations, which we do by setting
04373             branch_newextra to 2. Finally, we record whether the case-dependent
04374             flag ever changes within the regex. This is used by the "required
04375             character" code. */
04376 
04377             case ':':
04378             if (((set|unset) & PCRE_IMS) != 0)
04379               {
04380               length += 4;
04381               branch_newextra = 2;
04382               if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
04383               }
04384             goto END_OPTIONS;
04385 
04386             /* Unrecognized option character */
04387 
04388             default:
04389             *errorptr = ERR12;
04390             goto PCRE_ERROR_RETURN;
04391             }
04392           }
04393 
04394         /* If we hit a closing bracket, that's it - this is a freestanding
04395         option-setting. We need to ensure that branch_extra is updated if
04396         necessary. The only values branch_newextra can have here are 0 or 2.
04397         If the value is 2, then branch_extra must either be 2 or 5, depending
04398         on whether this is a lookbehind group or not. */
04399 
04400         END_OPTIONS:
04401         if (c == ')')
04402           {
04403           if (branch_newextra == 2 &&
04404               (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
04405             branch_extra += branch_newextra;
04406           continue;
04407           }
04408 
04409         /* If options were terminated by ':' control comes here. Fall through
04410         to handle the group below. */
04411         }
04412       }
04413 
04414     /* Extracting brackets must be counted so we can process escapes in a
04415     Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
04416     need an additional 3 bytes of store per extracting bracket. However, if
04417     PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
04418     must leave the count alone (it will always be zero). */
04419 
04420     else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
04421       {
04422       bracount++;
04423       if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
04424       }
04425 
04426     /* Save length for computing whole length at end if there's a repeat that
04427     requires duplication of the group. Also save the current value of
04428     branch_extra, and start the new group with the new value. If non-zero, this
04429     will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
04430 
04431     if (brastackptr >= sizeof(brastack)/sizeof(int))
04432       {
04433       *errorptr = ERR19;
04434       goto PCRE_ERROR_RETURN;
04435       }
04436 
04437     bralenstack[brastackptr] = branch_extra;
04438     branch_extra = branch_newextra;
04439 
04440     brastack[brastackptr++] = length;
04441     length += bracket_length;
04442     continue;
04443 
04444     /* Handle ket. Look for subsequent max/min; for certain sets of values we
04445     have to replicate this bracket up to that many times. If brastackptr is
04446     0 this is an unmatched bracket which will generate an error, but take care
04447     not to try to access brastack[-1] when computing the length and restoring
04448     the branch_extra value. */
04449 
04450     case ')':
04451     length += 1 + LINK_SIZE;
04452     if (brastackptr > 0)
04453       {
04454       duplength = length - brastack[--brastackptr];
04455       branch_extra = bralenstack[brastackptr];
04456       }
04457     else duplength = 0;
04458 
04459     /* The following code is also used when a recursion such as (?3) is
04460     followed by a quantifier, because in that case, it has to be wrapped inside
04461     brackets so that the quantifier works. The value of duplength must be
04462     set before arrival. */
04463 
04464     HANDLE_QUANTIFIED_BRACKETS:
04465 
04466     /* Leave ptr at the final char; for read_repeat_counts this happens
04467     automatically; for the others we need an increment. */
04468 
04469     if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
04470       {
04471       ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
04472       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04473       }
04474     else if (c == '*') { min = 0; max = -1; ptr++; }
04475     else if (c == '+') { min = 1; max = -1; ptr++; }
04476     else if (c == '?') { min = 0; max = 1;  ptr++; }
04477     else { min = 1; max = 1; }
04478 
04479     /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
04480     group, and if the maximum is greater than zero, we have to replicate
04481     maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
04482     bracket set. */
04483 
04484     if (min == 0)
04485       {
04486       length++;
04487       if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
04488       }
04489 
04490     /* When the minimum is greater than zero, we have to replicate up to
04491     minval-1 times, with no additions required in the copies. Then, if there
04492     is a limited maximum we have to replicate up to maxval-1 times allowing
04493     for a BRAZERO item before each optional copy and nesting brackets for all
04494     but one of the optional copies. */
04495 
04496     else
04497       {
04498       length += (min - 1) * duplength;
04499       if (max > min)   /* Need this test as max=-1 means no limit */
04500         length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
04501           - (2 + 2*LINK_SIZE);
04502       }
04503 
04504     /* Allow space for once brackets for "possessive quantifier" */
04505 
04506     if (ptr[1] == '+')
04507       {
04508       ptr++;
04509       length += 2 + 2*LINK_SIZE;
04510       }
04511     continue;
04512 
04513     /* Non-special character. For a run of such characters the length required
04514     is the number of characters + 2, except that the maximum run length is
04515     MAXLIT. We won't get a skipped space or a non-data escape or the start of a
04516     # comment as the first character, so the length can't be zero. */
04517 
04518     NORMAL_CHAR:
04519     default:
04520     length += 2;
04521     runlength = 0;
04522     do
04523       {
04524 #ifdef SUPPORT_UTF8
04525       lastcharlength = 1;     /* Need length of last char for UTF-8 repeats */
04526 #endif
04527 
04528       /* If in a \Q...\E sequence, check for end; otherwise it's a literal */
04529       if (inescq)
04530         {
04531         if (c == '\\' && ptr[1] == 'E')
04532           {
04533           inescq = FALSE;
04534           ptr++;
04535           }
04536         else runlength++;
04537         continue;
04538         }
04539 
04540       /* Skip whitespace and comments for /x */
04541 
04542       if ((options & PCRE_EXTENDED) != 0)
04543         {
04544         if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
04545         if (c == '#')
04546           {
04547           /* The space before the ; is to avoid a warning on a silly compiler
04548           on the Macintosh. */
04549           while ((c = *(++ptr)) != 0 && c != NEWLINE) ; /*** FIXME: test LF too? ***/
04550           continue;
04551           }
04552         }
04553 
04554       /* Backslash may introduce a data char or a metacharacter; stop the
04555       string before the latter. */
04556 
04557       if (c == '\\')
04558         {
04559         const uschar *saveptr = ptr;
04560         c = check_escape(&ptr, errorptr, bracount, options, FALSE,
04561           &compile_block);
04562         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04563         if (c < 0) { ptr = saveptr; break; }
04564 
04565         /* In UTF-8 mode, add on the number of additional bytes needed to
04566         encode this character, and save the total length in case this is a
04567         final char that is repeated. */
04568 
04569 #ifdef SUPPORT_UTF8
04570         if (utf8 && c > 127)
04571           {
04572           int i;
04573           for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
04574             if (c <= utf8_table1[i]) break;
04575           runlength += i;
04576           lastcharlength += i;
04577           }
04578 #endif
04579         }
04580 
04581       /* Ordinary character or single-char escape */
04582 
04583       runlength++;
04584       }
04585 
04586     /* This "while" is the end of the "do" above. */
04587 
04588     while (runlength < MAXLIT &&
04589       (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
04590 
04591     /* If we hit a meta-character, back off to point to it */
04592 
04593     if (runlength < MAXLIT) ptr--;
04594 
04595     /* If the last char in the string is a UTF-8 multibyte character, we must
04596     set lastcharlength correctly. If it was specified as an escape, this will
04597     already have been done above. However, we also have to support in-line
04598     UTF-8 characters, so check backwards from where we are. */
04599 
04600 #ifdef SUPPORT_UTF8
04601     if (utf8)
04602       {
04603       const uschar *lastptr = ptr - 1;
04604       if ((*lastptr & 0x80) != 0)
04605         {
04606         while((*lastptr & 0xc0) == 0x80) lastptr--;
04607         lastcharlength = ptr - lastptr;
04608         }
04609       }
04610 #endif
04611 
04612     length += runlength;
04613     continue;
04614     }
04615   }
04616 
04617 length += 2 + LINK_SIZE;    /* For final KET and END */
04618 
04619 if (length > MAX_PATTERN_SIZE)
04620   {
04621   *errorptr = ERR20;
04622   return NULL;
04623   }
04624 
04625 /* Compute the size of data block needed and get it, either from malloc or
04626 externally provided function. */
04627 
04628 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
04629 re = (real_pcre *)(pcre_malloc)(size);
04630 
04631 if (re == NULL)
04632   {
04633   *errorptr = ERR21;
04634   return NULL;
04635   }
04636 
04637 /* Put in the magic number, and save the size, options, and table pointer */
04638 
04639 re->magic_number = MAGIC_NUMBER;
04640 re->size = size;
04641 re->options = options;
04642 re->tables = tables;
04643 re->name_entry_size = max_name_size + 3;
04644 re->name_count = name_count;
04645 
04646 /* The starting points of the name/number translation table and of the code are
04647 passed around in the compile data block. */
04648 
04649 compile_block.names_found = 0;
04650 compile_block.name_entry_size = max_name_size + 3;
04651 compile_block.name_table = (uschar *)re + sizeof(real_pcre);
04652 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
04653 compile_block.start_code = codestart;
04654 compile_block.req_varyopt = 0;
04655 
04656 /* Set up a starting, non-extracting bracket, then compile the expression. On
04657 error, *errorptr will be set non-NULL, so we don't need to look at the result
04658 of the function here. */
04659 
04660 ptr = (const uschar *)pattern;
04661 code = (uschar *)codestart;
04662 *code = OP_BRA;
04663 bracount = 0;
04664 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
04665   errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
04666 re->top_bracket = bracount;
04667 re->top_backref = compile_block.top_backref;
04668 
04669 /* If not reached end of pattern on success, there's an excess bracket. */
04670 
04671 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
04672 
04673 /* Fill in the terminating state and check for disastrous overflow, but
04674 if debugging, leave the test till after things are printed out. */
04675 
04676 *code++ = OP_END;
04677 
04678 #ifndef DEBUG
04679 if (code - codestart > length) *errorptr = ERR23;
04680 #endif
04681 
04682 /* Give an error if there's a back reference to a non-existent capturing
04683 subpattern. */
04684 
04685 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
04686 
04687 /* Failed to compile, or error while post-processing */
04688 
04689 if (*errorptr != NULL)
04690   {
04691   (pcre_free)(re);
04692   PCRE_ERROR_RETURN:
04693   *erroroffset = ptr - (const uschar *)pattern;
04694   return NULL;
04695   }
04696 
04697 /* If the anchored option was not passed, set the flag if we can determine that
04698 the pattern is anchored by virtue of ^ characters or \A or anything else (such
04699 as starting with .* when DOTALL is set).
04700 
04701 Otherwise, if we know what the first character has to be, save it, because that
04702 speeds up unanchored matches no end. If not, see if we can set the
04703 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
04704 start with ^, and also when all branches start with .* for non-DOTALL matches.
04705 */
04706 
04707 if ((options & PCRE_ANCHORED) == 0)
04708   {
04709   int temp_options = options;
04710   if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
04711     re->options |= PCRE_ANCHORED;
04712   else
04713     {
04714     if (firstbyte < 0)
04715       firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
04716     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
04717       {
04718       int ch = firstbyte & 255;
04719       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
04720          compile_block.fcc[ch] == ch)? ch : firstbyte;
04721       re->options |= PCRE_FIRSTSET;
04722       }
04723     else if (is_startline(codestart, 0, compile_block.backref_map))
04724       re->options |= PCRE_STARTLINE;
04725     }
04726   }
04727 
04728 /* For an anchored pattern, we use the "required byte" only if it follows a
04729 variable length item in the regex. Remove the caseless flag for non-caseable
04730 chars. */
04731 
04732 if (reqbyte >= 0 &&
04733      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
04734   {
04735   int ch = reqbyte & 255;
04736   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
04737     compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
04738   re->options |= PCRE_REQCHSET;
04739   }
04740 
04741 /* Print out the compiled data for debugging */
04742 
04743 #ifdef DEBUG
04744 
04745 printf("Length = %d top_bracket = %d top_backref = %d\n",
04746   length, re->top_bracket, re->top_backref);
04747 
04748 if (re->options != 0)
04749   {
04750   printf("%s%s%s%s%s%s%s%s%s\n",
04751     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
04752     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
04753     ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
04754     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
04755     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
04756     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
04757     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
04758     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
04759     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
04760   }
04761 
04762 if ((re->options & PCRE_FIRSTSET) != 0)
04763   {
04764   int ch = re->first_byte & 255;
04765   char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
04766   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
04767     else printf("First char = \\x%02x%s\n", ch, caseless);
04768   }
04769 
04770 if ((re->options & PCRE_REQCHSET) != 0)
04771   {
04772   int ch = re->req_byte & 255;
04773   char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
04774   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
04775     else printf("Req char = \\x%02x%s\n", ch, caseless);
04776   }
04777 
04778 print_internals(re, stdout);
04779 
04780 /* This check is done here in the debugging case so that the code that
04781 was compiled can be seen. */
04782 
04783 if (code - codestart > length)
04784   {
04785   *errorptr = ERR23;
04786   (pcre_free)(re);
04787   *erroroffset = ptr - (uschar *)pattern;
04788   return NULL;
04789   }
04790 #endif
04791 
04792 return (pcre *)re;
04793 }
04794 
04795 
04796 
04797 /*************************************************
04798 *          Match a back-reference                *
04799 *************************************************/
04800 
04801 /* If a back reference hasn't been set, the length that is passed is greater
04802 than the number of characters left in the string, so the match fails.
04803 
04804 Arguments:
04805   offset      index into the offset vector
04806   eptr        points into the subject
04807   length      length to be matched
04808   md          points to match data block
04809   ims         the ims flags
04810 
04811 Returns:      TRUE if matched
04812 */
04813 
04814 static BOOL
04815 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
04816   unsigned long int ims)
04817 {
04818 const uschar *p = md->start_subject + md->offset_vector[offset];
04819 
04820 #ifdef DEBUG
04821 if (eptr >= md->end_subject)
04822   printf("matching subject <null>");
04823 else
04824   {
04825   printf("matching subject ");
04826   pchars(eptr, length, TRUE, md);
04827   }
04828 printf(" against backref ");
04829 pchars(p, length, FALSE, md);
04830 printf("\n");
04831 #endif
04832 
04833 /* Always fail if not enough characters left */
04834 
04835 if (length > md->end_subject - eptr) return FALSE;
04836 
04837 /* Separate the caselesss case for speed */
04838 
04839 if ((ims & PCRE_CASELESS) != 0)
04840   {
04841   while (length-- > 0)
04842     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
04843   }
04844 else
04845   { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
04846 
04847 return TRUE;
04848 }
04849 
04850 
04851 #ifdef SUPPORT_UTF8
04852 /*************************************************
04853 *       Match character against an XCLASS        *
04854 *************************************************/
04855 
04856 /* This function is called from within the XCLASS code below, to match a
04857 character against an extended class which might match values > 255.
04858 
04859 Arguments:
04860   c           the character
04861   data        points to the flag byte of the XCLASS data
04862 
04863 Returns:      TRUE if character matches, else FALSE
04864 */
04865 
04866 static BOOL
04867 match_xclass(int c, const uschar *data)
04868 {
04869 int t;
04870 BOOL negated = (*data & XCL_NOT) != 0;
04871 
04872 /* Character values < 256 are matched against a bitmap, if one is present. If
04873 not, we still carry on, because there may be ranges that start below 256 in the
04874 additional data. */
04875 
04876 if (c < 256)
04877   {
04878   if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
04879     return !negated;   /* char found */
04880   }
04881 
04882 /* Now match against the list of large chars or ranges that end with a large
04883 char. First skip the bit map if present. */
04884 
04885 if ((*data++ & XCL_MAP) != 0) data += 32;
04886 
04887 while ((t = *data++) != XCL_END)
04888   {
04889   int x, y;
04890   GETCHARINC(x, data);
04891   if (t == XCL_SINGLE)
04892     {
04893     if (c == x) return !negated;
04894     }
04895   else
04896     {
04897     GETCHARINC(y, data);
04898     if (c >= x && c <= y) return !negated;
04899     }
04900   }
04901 
04902 return negated;   /* char was not found */
04903 }
04904 #endif
04905 
04906 
04907 
04908 
04909 /*************************************************
04910 *         Match from current position            *
04911 *************************************************/
04912 
04913 /* On entry ecode points to the first opcode, and eptr to the first character
04914 in the subject string, while eptrb holds the value of eptr at the start of the
04915 last bracketed group - used for breaking infinite loops matching zero-length
04916 strings. This function is called recursively in many circumstances. Whenever it
04917 returns a negative (error) response, the outer incarnation must also return the
04918 same response.
04919 
04920 Performance note: It might be tempting to extract commonly used fields from the
04921 md structure (e.g. utf8, end_subject) into individual variables to improve
04922 performance. Tests using gcc on a SPARC disproved this; in the first case, it
04923 made performance worse.
04924 
04925 Arguments:
04926    eptr        pointer in subject
04927    ecode       position in code
04928    offset_top  current top pointer
04929    md          pointer to "static" info for the match
04930    ims         current /i, /m, and /s options
04931    eptrb       pointer to chain of blocks containing eptr at start of
04932                  brackets - for testing for empty matches
04933    flags       can contain
04934                  match_condassert - this is an assertion condition
04935                  match_isgroup - this is the start of a bracketed group
04936 
04937 Returns:       MATCH_MATCH if matched            )  these values are >= 0
04938                MATCH_NOMATCH if failed to match  )
04939                a negative PCRE_ERROR_xxx value if aborted by an error condition
04940                  (e.g. stopped by recursion limit)
04941 */
04942 
04943 static int
04944 match(register const uschar *eptr, register const uschar *ecode,
04945   int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
04946   int flags)
04947 {
04948 unsigned long int original_ims = ims;   /* Save for resetting on ')' */
04949 register int rrc;
04950 eptrblock newptrb;
04951 
04952 if (md->match_call_count++ >= md->match_limit) return PCRE_ERROR_MATCHLIMIT;
04953 
04954 /* At the start of a bracketed group, add the current subject pointer to the
04955 stack of such pointers, to be re-instated at the end of the group when we hit
04956 the closing ket. When match() is called in other circumstances, we don't add to
04957 the stack. */
04958 
04959 if ((flags & match_isgroup) != 0)
04960   {
04961   newptrb.prev = eptrb;
04962   newptrb.saved_eptr = eptr;
04963   eptrb = &newptrb;
04964   }
04965 
04966 /* Now start processing the operations. */
04967 
04968 for (;;)
04969   {
04970   int op = (int)*ecode;
04971   int min, max, ctype;
04972   register int i;
04973   register int c;
04974   BOOL minimize = FALSE;
04975 
04976   /* Opening capturing bracket. If there is space in the offset vector, save
04977   the current subject position in the working slot at the top of the vector. We
04978   mustn't change the current values of the data slot, because they may be set
04979   from a previous iteration of this group, and be referred to by a reference
04980   inside the group.
04981 
04982   If the bracket fails to match, we need to restore this value and also the
04983   values of the final offsets, in case they were set by a previous iteration of
04984   the same bracket.
04985 
04986   If there isn't enough space in the offset vector, treat this as if it were a
04987   non-capturing bracket. Don't worry about setting the flag for the error case
04988   here; that is handled in the code for KET. */
04989 
04990   if (op > OP_BRA)
04991     {
04992     int offset;
04993     int number = op - OP_BRA;
04994 
04995     /* For extended extraction brackets (large number), we have to fish out the
04996     number from a dummy opcode at the start. */
04997 
04998     if (number > EXTRACT_BASIC_MAX)
04999       number = GET2(ecode, 2+LINK_SIZE);
05000     offset = number << 1;
05001 
05002 #ifdef DEBUG
05003     printf("start bracket %d subject=", number);
05004     pchars(eptr, 16, TRUE, md);
05005     printf("\n");
05006 #endif
05007 
05008     if (offset < md->offset_max)
05009       {
05010       int save_offset1 = md->offset_vector[offset];
05011       int save_offset2 = md->offset_vector[offset+1];
05012       int save_offset3 = md->offset_vector[md->offset_end - number];
05013       int save_capture_last = md->capture_last;
05014 
05015       DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
05016       md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
05017 
05018       do
05019         {
05020         if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
05021               eptrb, match_isgroup)) != MATCH_NOMATCH) return rrc;
05022         md->capture_last = save_capture_last;
05023         ecode += GET(ecode, 1);
05024         }
05025       while (*ecode == OP_ALT);
05026 
05027       DPRINTF(("bracket %d failed\n", number));
05028 
05029       md->offset_vector[offset] = save_offset1;
05030       md->offset_vector[offset+1] = save_offset2;
05031       md->offset_vector[md->offset_end - number] = save_offset3;
05032 
05033       return MATCH_NOMATCH;
05034       }
05035 
05036     /* Insufficient room for saving captured contents */
05037 
05038     else op = OP_BRA;
05039     }
05040 
05041   /* Other types of node can be handled by a switch */
05042 
05043   switch(op)
05044     {
05045     case OP_BRA:     /* Non-capturing bracket: optimized */
05046     DPRINTF(("start bracket 0\n"));
05047     do
05048       {
05049       if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
05050         match_isgroup)) != MATCH_NOMATCH) return rrc;
05051       ecode += GET(ecode, 1);
05052       }
05053     while (*ecode == OP_ALT);
05054     DPRINTF(("bracket 0 failed\n"));
05055     return MATCH_NOMATCH;
05056 
05057     /* Conditional group: compilation checked that there are no more than
05058     two branches. If the condition is false, skipping the first branch takes us
05059     past the end if there is only one branch, but that's OK because that is
05060     exactly what going to the ket would do. */
05061 
05062     case OP_COND:
05063     if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
05064       {
05065       int offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
05066       BOOL condition = (offset == CREF_RECURSE * 2)?
05067         (md->recursive != NULL) :
05068         (offset < offset_top && md->offset_vector[offset] >= 0);
05069       return match(eptr, ecode + (condition?
05070         (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
05071         offset_top, md, ims, eptrb, match_isgroup);
05072       }
05073 
05074     /* The condition is an assertion. Call match() to evaluate it - setting the
05075     match_condassert flag causes it to stop at the end of an assertion. */
05076 
05077     else
05078       {
05079       if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
05080           match_condassert | match_isgroup)) == MATCH_MATCH)
05081         {
05082         ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
05083         while (*ecode == OP_ALT) ecode += GET(ecode, 1);
05084         }
05085       else if (rrc != MATCH_NOMATCH) return rrc;
05086       else ecode += GET(ecode, 1);
05087       return match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
05088         match_isgroup);
05089       }
05090     /* Control never reaches here */
05091 
05092     /* Skip over conditional reference or large extraction number data if
05093     encountered. */
05094 
05095     case OP_CREF:
05096     case OP_BRANUMBER:
05097     ecode += 3;
05098     break;
05099 
05100     /* End of the pattern. If we are in a recursion, we should restore the
05101     offsets appropriately and continue from after the call. */
05102 
05103     case OP_END:
05104     if (md->recursive != NULL && md->recursive->group_num == 0)
05105       {
05106       recursion_info *rec = md->recursive;
05107       DPRINTF(("Hit the end in a (?0) recursion\n"));
05108       md->recursive = rec->prev;
05109       memmove(md->offset_vector, rec->offset_save,
05110         rec->saved_max * sizeof(int));
05111       md->start_match = rec->save_start;
05112       ims = original_ims;
05113       ecode = rec->after_call;
05114       break;
05115       }
05116 
05117     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
05118     string - backtracking will then try other alternatives, if any. */
05119 
05120     if (md->notempty && eptr == md->start_match) return MATCH_NOMATCH;
05121     md->end_match_ptr = eptr;          /* Record where we ended */
05122     md->end_offset_top = offset_top;   /* and how many extracts were taken */
05123     return MATCH_MATCH;
05124 
05125     /* Change option settings */
05126 
05127     case OP_OPT:
05128     ims = ecode[1];
05129     ecode += 2;
05130     DPRINTF(("ims set to %02lx\n", ims));
05131     break;
05132 
05133     /* Assertion brackets. Check the alternative branches in turn - the
05134     matching won't pass the KET for an assertion. If any one branch matches,
05135     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
05136     start of each branch to move the current point backwards, so the code at
05137     this level is identical to the lookahead case. */
05138 
05139     case OP_ASSERT:
05140     case OP_ASSERTBACK:
05141     do
05142       {
05143       if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
05144         match_isgroup)) == MATCH_MATCH) break;
05145       if (rrc != MATCH_NOMATCH) return rrc;
05146       ecode += GET(ecode, 1);
05147       }
05148     while (*ecode == OP_ALT);
05149     if (*ecode == OP_KET) return MATCH_NOMATCH;
05150 
05151     /* If checking an assertion for a condition, return MATCH_MATCH. */
05152 
05153     if ((flags & match_condassert) != 0) return MATCH_MATCH;
05154 
05155     /* Continue from after the assertion, updating the offsets high water
05156     mark, since extracts may have been taken during the assertion. */
05157 
05158     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
05159     ecode += 1 + LINK_SIZE;
05160     offset_top = md->end_offset_top;
05161     continue;
05162 
05163     /* Negative assertion: all branches must fail to match */
05164 
05165     case OP_ASSERT_NOT:
05166     case OP_ASSERTBACK_NOT:
05167     do
05168       {
05169       if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
05170         match_isgroup)) == MATCH_MATCH) return MATCH_NOMATCH;
05171       if (rrc != MATCH_NOMATCH) return rrc;
05172       ecode += GET(ecode,1);
05173       }
05174     while (*ecode == OP_ALT);
05175 
05176     if ((flags & match_condassert) != 0) return MATCH_MATCH;
05177 
05178     ecode += 1 + LINK_SIZE;
05179     continue;
05180 
05181     /* Move the subject pointer back. This occurs only at the start of
05182     each branch of a lookbehind assertion. If we are too close to the start to
05183     move back, this match function fails. When working with UTF-8 we move
05184     back a number of characters, not bytes. */
05185 
05186     case OP_REVERSE:
05187 #ifdef SUPPORT_UTF8
05188     c = GET(ecode,1);
05189     for (i = 0; i < c; i++)
05190       {
05191       eptr--;
05192       BACKCHAR(eptr)
05193       }
05194 #else
05195     eptr -= GET(ecode,1);
05196 #endif
05197 
05198     if (eptr < md->start_subject) return MATCH_NOMATCH;
05199     ecode += 1 + LINK_SIZE;
05200     break;
05201 
05202     /* The callout item calls an external function, if one is provided, passing
05203     details of the match so far. This is mainly for debugging, though the
05204     function is able to force a failure. */
05205 
05206     case OP_CALLOUT:
05207     if (pcre_callout != NULL)
05208       {
05209       pcre_callout_block cb;
05210       cb.version          = 0;   /* Version 0 of the callout block */
05211       cb.callout_number   = ecode[1];
05212       cb.offset_vector    = md->offset_vector;
05213       cb.subject          = (const char *)md->start_subject;
05214       cb.subject_length   = md->end_subject - md->start_subject;
05215       cb.start_match      = md->start_match - md->start_subject;
05216       cb.current_position = eptr - md->start_subject;
05217       cb.capture_top      = offset_top/2;
05218       cb.capture_last     = md->capture_last;
05219       cb.callout_data     = md->callout_data;
05220       if ((rrc = (*pcre_callout)(&cb)) > 0) return MATCH_NOMATCH;
05221       if (rrc < 0) return rrc;
05222       }
05223     ecode += 2;
05224     break;
05225 
05226     /* Recursion either matches the current regex, or some subexpression. The
05227     offset data is the offset to the starting bracket from the start of the
05228     whole pattern. However, it is possible that a BRAZERO was inserted before
05229     this bracket after we took the offset - we just skip it if encountered.
05230 
05231     If there are any capturing brackets started but not finished, we have to
05232     save their starting points and reinstate them after the recursion. However,
05233     we don't know how many such there are (offset_top records the completed
05234     total) so we just have to save all the potential data. There may be up to
05235     65535 such values, which is too large to put on the stack, but using malloc
05236     for small numbers seems expensive. As a compromise, the stack is used when
05237     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
05238     is used. If the malloc fails, the match is abandoned and
05239     PCRE_ERROR_NOMEMORY is returned; like any negative (error) response it
05240     must be passed back by each outer incarnation of match().
05241 
05242     There are also other values that have to be saved. We use a chained
05243     sequence of blocks that actually live on the stack. Thanks to Robin Houston
05244     for the original version of this logic. */
05245 
05246     case OP_RECURSE:
05247       {
05248       int stacksave[REC_STACK_SAVE_MAX];
05249       recursion_info new_recursive;
05250       const uschar *callpat = md->start_code + GET(ecode, 1);
05251 
05252       if (*callpat == OP_BRAZERO) callpat++;
05253 
05254       new_recursive.group_num = *callpat - OP_BRA;
05255 
05256       /* For extended extraction brackets (large number), we have to fish out
05257       the number from a dummy opcode at the start. */
05258 
05259       if (new_recursive.group_num > EXTRACT_BASIC_MAX)
05260         new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
05261 
05262       /* Add to "recursing stack" */
05263 
05264       new_recursive.prev = md->recursive;
05265       md->recursive = &new_recursive;
05266 
05267       /* Find where to continue from afterwards */
05268 
05269       ecode += 1 + LINK_SIZE;
05270       new_recursive.after_call = ecode;
05271 
05272       /* Now save the offset data. */
05273 
05274       new_recursive.saved_max = md->offset_end;
05275       if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
05276         new_recursive.offset_save = stacksave;
05277       else
05278         {
05279         new_recursive.offset_save =
05280           (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
05281         if (new_recursive.offset_save == NULL) return PCRE_ERROR_NOMEMORY;
05282         }
05283 
05284       memcpy(new_recursive.offset_save, md->offset_vector,
05285             new_recursive.saved_max * sizeof(int));
05286       new_recursive.save_start = md->start_match;
05287       md->start_match = eptr;
05288 
05289       /* OK, now we can do the recursion. For each top-level alternative we
05290       restore the offset and recursion data. */
05291 
05292       DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
05293       do
05294         {
05295         if ((rrc = match(eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
05296             eptrb, match_isgroup)) == MATCH_MATCH)
05297           {
05298           md->recursive = new_recursive.prev;
05299           if (new_recursive.offset_save != stacksave)
05300             (pcre_free)(new_recursive.offset_save);
05301           return MATCH_MATCH;
05302           }
05303         else if (rrc != MATCH_NOMATCH) return rrc;
05304 
05305         md->recursive = &new_recursive;
05306         memcpy(md->offset_vector, new_recursive.offset_save,
05307             new_recursive.saved_max * sizeof(int));
05308         callpat += GET(callpat, 1);
05309         }
05310       while (*callpat == OP_ALT);
05311 
05312       DPRINTF(("Recursion didn't match\n"));
05313       md->recursive = new_recursive.prev;
05314       if (new_recursive.offset_save != stacksave)
05315         (pcre_free)(new_recursive.offset_save);
05316       return MATCH_NOMATCH;
05317       }
05318     /* Control never reaches here */
05319 
05320     /* "Once" brackets are like assertion brackets except that after a match,
05321     the point in the subject string is not moved back. Thus there can never be
05322     a move back into the brackets. Friedl calls these "atomic" subpatterns.
05323     Check the alternative branches in turn - the matching won't pass the KET
05324     for this kind of subpattern. If any one branch matches, we carry on as at
05325     the end of a normal bracket, leaving the subject pointer. */
05326 
05327     case OP_ONCE:
05328       {
05329       const uschar *prev = ecode;
05330       const uschar *saved_eptr = eptr;
05331 
05332       do
05333         {
05334         if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
05335           eptrb, match_isgroup)) == MATCH_MATCH) break;
05336         if (rrc != MATCH_NOMATCH) return rrc;
05337         ecode += GET(ecode,1);
05338         }
05339       while (*ecode == OP_ALT);
05340 
05341       /* If we hit the end of the group (which could be repeated) without any
05341       branch having matched, fail */
05342 
05343       if (*ecode != OP_ONCE && *ecode != OP_ALT) return MATCH_NOMATCH;
05344 
05345       /* Continue as from after the assertion, updating the offsets high water
05346       mark, since extracts may have been taken. */
05347 
05348       do ecode += GET(ecode,1); while (*ecode == OP_ALT);
05349 
05350       offset_top = md->end_offset_top;
05351       eptr = md->end_match_ptr;
05352 
05353       /* For a non-repeating ket, just continue at this level. This also
05354       happens for a repeating ket if no characters were matched in the group.
05355       This is the forcible breaking of infinite loops as implemented in Perl
05356       5.005. If there is an options reset, it will get obeyed in the normal
05357       course of events. */
05358 
05359       if (*ecode == OP_KET || eptr == saved_eptr)
05360         {
05361         ecode += 1+LINK_SIZE;
05362         break;
05363         }
05364 
05365       /* The repeating kets try the rest of the pattern or restart from the
05366       preceding bracket, in the appropriate order. We need to reset any options
05367       that changed within the bracket before re-running it, so check the next
05368       opcode. */
05369 
05370       if (ecode[1+LINK_SIZE] == OP_OPT)
05371         {
05372         ims = (ims & ~PCRE_IMS) | ecode[4];
05373         DPRINTF(("ims set to %02lx at group repeat\n", ims));
05374         }
05375 
05376       if (*ecode == OP_KETRMIN)
05377         {
05378         if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
05379              eptrb, 0)) != MATCH_NOMATCH) return rrc;
05380         if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
05381              match_isgroup)) != MATCH_NOMATCH) return rrc;
05382         }
05383       else  /* OP_KETRMAX */
05384         {
05385         if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
05386              match_isgroup)) != MATCH_NOMATCH) return rrc;
05387         if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
05388              0)) != MATCH_NOMATCH) return rrc;
05389         }
05390       }
05391     return MATCH_NOMATCH;
05392 
05393     /* An alternation is the end of a branch; scan along to find the end of the
05394     bracketed group and go to there. */
05395 
05396     case OP_ALT:
05397     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
05398     break;
05399 
05400     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
05401     that it may occur zero times. It may repeat infinitely, or not at all -
05402     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
05403     repeat limits are compiled as a number of copies, with the optional ones
05404     preceded by BRAZERO or BRAMINZERO. */
05405 
05406     case OP_BRAZERO:
05407       {
05408       const uschar *next = ecode+1;
05409       if ((rrc = match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
05410            != MATCH_NOMATCH) return rrc;
05411       do next += GET(next,1); while (*next == OP_ALT);
05412       ecode = next + 1+LINK_SIZE;
05413       }
05414     break;
05415 
05416     case OP_BRAMINZERO:
05417       {
05418       const uschar *next = ecode+1;
05419       do next += GET(next,1); while (*next == OP_ALT);
05420       if ((rrc = match(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
05421         match_isgroup)) != MATCH_NOMATCH) return rrc;
05422       ecode++;
05423       }
05424     break;
05425 
05426     /* End of a group, repeated or non-repeating. If we are at the end of
05427     an assertion "group", stop matching and return MATCH_MATCH, but record the
05428     current high water mark for use by positive assertions. Do this also
05429     for the "once" (not-backing-up) groups. */
05430 
05431     case OP_KET:
05432     case OP_KETRMIN:
05433     case OP_KETRMAX:
05434       {
05435       const uschar *prev = ecode - GET(ecode, 1);
05436       const uschar *saved_eptr = eptrb->saved_eptr;
05437 
05438       eptrb = eptrb->prev;    /* Back up the stack of bracket start pointers */
05439 
05440       if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
05441           *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
05442           *prev == OP_ONCE)
05443         {
05444         md->end_match_ptr = eptr;      /* For ONCE */
05445         md->end_offset_top = offset_top;
05446         return MATCH_MATCH;
05447         }
05448 
05449       /* In all other cases except a conditional group we have to check the
05450       group number back at the start and if necessary complete handling an
05451       extraction by setting the offsets and bumping the high water mark. */
05452 
05453       if (*prev != OP_COND)
05454         {
05455         int offset;
05456         int number = *prev - OP_BRA;
05457 
05458         /* For extended extraction brackets (large number), we have to fish out
05459         the number from a dummy opcode at the start. */
05460 
05461         if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
05462         offset = number << 1;
05463 
05464 #ifdef DEBUG
05465         printf("end bracket %d", number);
05466         printf("\n");
05467 #endif
05468 
05469         /* Test for a numbered group. This includes groups called as a result
05470         of recursion. Note that whole-pattern recursion is coded as a recurse
05471         into group 0, so it won't be picked up here. Instead, we catch it when
05472         the OP_END is reached. */
05473 
05474         if (number > 0)
05475           {
05476           md->capture_last = number;
05477           if (offset >= md->offset_max) md->offset_overflow = TRUE; else
05478             {
05479             md->offset_vector[offset] =
05480               md->offset_vector[md->offset_end - number];
05481             md->offset_vector[offset+1] = eptr - md->start_subject;
05482             if (offset_top <= offset) offset_top = offset + 2;
05483             }
05484 
05485           /* Handle a recursively called group. Restore the offsets
05486           appropriately and continue from after the call. */
05487 
05488           if (md->recursive != NULL && md->recursive->group_num == number)
05489             {
05490             recursion_info *rec = md->recursive;
05491             DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
05492             md->recursive = rec->prev;
05493             md->start_match = rec->save_start;
05494             memcpy(md->offset_vector, rec->offset_save,
05495               rec->saved_max * sizeof(int));
05496             ecode = rec->after_call;
05497             ims = original_ims;
05498             break;
05499             }
05500           }
05501         }
05502 
05503       /* Reset the value of the ims flags, in case they got changed during
05504       the group. */
05505 
05506       ims = original_ims;
05507       DPRINTF(("ims reset to %02lx\n", ims));
05508 
05509       /* For a non-repeating ket, just continue at this level. This also
05510       happens for a repeating ket if no characters were matched in the group.
05511       This is the forcible breaking of infinite loops as implemented in Perl
05512       5.005. If there is an options reset, it will get obeyed in the normal
05513       course of events. */
05514 
05515       if (*ecode == OP_KET || eptr == saved_eptr)
05516         {
05517         ecode += 1 + LINK_SIZE;
05518         break;
05519         }
05520 
05521       /* The repeating kets try the rest of the pattern or restart from the
05522       preceding bracket, in the appropriate order. */
05523 
05524       if (*ecode == OP_KETRMIN)
05525         {
05526         if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
05527              0)) != MATCH_NOMATCH) return rrc;
05528         if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
05529              match_isgroup)) != MATCH_NOMATCH) return rrc;
05530         }
05531       else  /* OP_KETRMAX */
05532         {
05533         if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
05534              match_isgroup)) != MATCH_NOMATCH) return rrc;
05535         if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
05536              0)) != MATCH_NOMATCH) return rrc;
05537         }
05538       }
05539     return MATCH_NOMATCH;
05540 
05541     /* Start of subject unless notbol, or after internal newline if multiline */
05542 
05543     case OP_CIRC:
05544     if (md->notbol && eptr == md->start_subject) return MATCH_NOMATCH;
05545     if ((ims & PCRE_MULTILINE) != 0)
05546       {
05547       if (eptr != md->start_subject && eptr[-1] != NEWLINE) /*** FIXME: test LF too? ***/
05548         return MATCH_NOMATCH;
05549       ecode++;
05550       break;
05551       }
05552     /* ... else fall through */
05553 
05554     /* Start of subject assertion */
05555 
05556     case OP_SOD:
05557     if (eptr != md->start_subject) return MATCH_NOMATCH;
05558     ecode++;
05559     break;
05560 
05561     /* Start of match assertion */
05562 
05563     case OP_SOM:
05564     if (eptr != md->start_subject + md->start_offset) return MATCH_NOMATCH;
05565     ecode++;
05566     break;
05567 
05568     /* Assert before internal newline if multiline, or before a terminating
05569     newline unless endonly is set, else end of subject unless noteol is set. */
05570 
05571     case OP_DOLL:
05572     if ((ims & PCRE_MULTILINE) != 0)
05573       {
05574       if (eptr < md->end_subject)
05575         { if (*eptr != NEWLINE) return MATCH_NOMATCH; } /*** FIXME: test LF too? ***/
05576       else
05577         { if (md->noteol) return MATCH_NOMATCH; }
05578       ecode++;
05579       break;
05580       }
05581     else
05582       {
05583       if (md->noteol) return MATCH_NOMATCH;
05584       if (!md->endonly)
05585         {
05586         if (eptr < md->end_subject - 1 ||
05587            (eptr == md->end_subject - 1 && *eptr != NEWLINE)) /*** FIXME: test LF too? ***/
05588           return MATCH_NOMATCH;
05589         ecode++;
05590         break;
05591         }
05592       }
05593     /* ... else fall through */
05594 
05595     /* End of subject assertion (\z) */
05596 
05597     case OP_EOD:
05598     if (eptr < md->end_subject) return MATCH_NOMATCH;
05599     ecode++;
05600     break;
05601 
05602     /* End of subject or ending \n assertion (\Z) */
05603 
05604     case OP_EODN:
05605     if (eptr < md->end_subject - 1 ||
05606        (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return MATCH_NOMATCH; /*** FIXME: test LF too? ***/
05607     ecode++;
05608     break;
05609 
05610     /* Word boundary assertions */
05611 
05612     case OP_NOT_WORD_BOUNDARY:
05613     case OP_WORD_BOUNDARY:
05614       {
05615       BOOL prev_is_word, cur_is_word;
05616 
05617       /* Find out if the previous and current characters are "word" characters.
05618       It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
05619       be "non-word" characters. */
05620 
05621 #ifdef SUPPORT_UTF8
05622       if (md->utf8)
05623         {
05624         if (eptr == md->start_subject) prev_is_word = FALSE; else
05625           {
05626           const uschar *lastptr = eptr - 1;
05627           while((*lastptr & 0xc0) == 0x80) lastptr--;
05628           GETCHAR(c, lastptr);
05629           prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
05630           }
05631         if (eptr >= md->end_subject) cur_is_word = FALSE; else
05632           {
05633           GETCHAR(c, eptr);
05634           cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
05635           }
05636         }
05637       else
05638 #endif
05639 
05640       /* More streamlined when not in UTF-8 mode */
05641 
05642         {
05643         prev_is_word = (eptr != md->start_subject) &&
05644           ((md->ctypes[eptr[-1]] & ctype_word) != 0);
05645         cur_is_word = (eptr < md->end_subject) &&
05646           ((md->ctypes[*eptr] & ctype_word) != 0);
05647         }
05648 
05649       /* Now see if the situation is what we want */
05650 
05651       if ((*ecode++ == OP_WORD_BOUNDARY)?
05652            cur_is_word == prev_is_word : cur_is_word != prev_is_word)
05653         return MATCH_NOMATCH;
05654       }
05655     break;
05656 
05657     /* Match a single character type; inline for speed */
05658 
05659     case OP_ANY:
05660     if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE) /*** FIXME: test LF too? ***/
05661       return MATCH_NOMATCH;
05662     if (eptr++ >= md->end_subject) return MATCH_NOMATCH;
05663 #ifdef SUPPORT_UTF8
05664     if (md->utf8)
05665       while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
05666 #endif
05667     ecode++;
05668     break;
05669 
05670     /* Match a single byte, even in UTF-8 mode. This opcode really does match
05671     any byte, even newline, independent of the setting of PCRE_DOTALL. */
05672 
05673     case OP_ANYBYTE:
05674     if (eptr++ >= md->end_subject) return MATCH_NOMATCH;
05675     ecode++;
05676     break;
05677 
05678     case OP_NOT_DIGIT:
05679     if (eptr >= md->end_subject) return MATCH_NOMATCH;
05680     GETCHARINCTEST(c, eptr);
05681     if (
05682 #ifdef SUPPORT_UTF8
05683        c < 256 &&
05684 #endif
05685        (md->ctypes[c] & ctype_digit) != 0
05686        )
05687       return MATCH_NOMATCH;
05688     ecode++;
05689     break;
05690 
05691     case OP_DIGIT:
05692     if (eptr >= md->end_subject) return MATCH_NOMATCH;
05693     GETCHARINCTEST(c, eptr);
05694     if (
05695 #ifdef SUPPORT_UTF8
05696        c >= 256 ||
05697 #endif
05698        (md->ctypes[c] & ctype_digit) == 0
05699        )
05700       return MATCH_NOMATCH;
05701     ecode++;
05702     break;
05703 
05704     case OP_NOT_WHITESPACE:
05705     if (eptr >= md->end_subject) return MATCH_NOMATCH;
05706     GETCHARINCTEST(c, eptr);
05707     if (
05708 #ifdef SUPPORT_UTF8
05709        c < 256 &&
05710 #endif
05711        (md->ctypes[c] & ctype_space) != 0
05712        )
05713       return MATCH_NOMATCH;
05714     ecode++;
05715     break;
05716 
05717     case OP_WHITESPACE:
05718     if (eptr >= md->end_subject) return MATCH_NOMATCH;
05719     GETCHARINCTEST(c, eptr);
05720     if (
05721 #ifdef SUPPORT_UTF8
05722        c >= 256 ||
05723 #endif
05724        (md->ctypes[c] & ctype_space) == 0
05725        )
05726       return MATCH_NOMATCH;
05727     ecode++;
05728     break;
05729 
05730     case OP_NOT_WORDCHAR:
05731     if (eptr >= md->end_subject) return MATCH_NOMATCH;
05732     GETCHARINCTEST(c, eptr);
05733     if (
05734 #ifdef SUPPORT_UTF8
05735        c < 256 &&
05736 #endif
05737        (md->ctypes[c] & ctype_word) != 0
05738        )
05739       return MATCH_NOMATCH;
05740     ecode++;
05741     break;
05742 
05743     case OP_WORDCHAR:
05744     if (eptr >= md->end_subject) return MATCH_NOMATCH;
05745     GETCHARINCTEST(c, eptr);
05746     if (
05747 #ifdef SUPPORT_UTF8
05748        c >= 256 ||
05749 #endif
05750        (md->ctypes[c] & ctype_word) == 0
05751        )
05752       return MATCH_NOMATCH;
05753     ecode++;
05754     break;
05755 
05756     /* Match a back reference, possibly repeatedly. Look past the end of the
05757     item to see if there is repeat information following. The code is similar
05758     to that for character classes, but repeated for efficiency. Then obey
05759     similar code to character type repeats - written out again for speed.
05760     However, if the referenced string is the empty string, always treat
05761     it as matched, any number of times (otherwise there could be infinite
05762     loops). */
05763 
05764     case OP_REF:
05765       {
05766       int length;
05767       int offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
05768       ecode += 3;                                     /* Advance past item */
05769 
05770       /* If the reference is unset, set the length to be longer than the amount
05771       of subject left; this ensures that every attempt at a match fails. We
05772       can't just fail here, because of the possibility of quantifiers with zero
05773       minima. */
05774 
05775       length = (offset >= offset_top || md->offset_vector[offset] < 0)?
05776         md->end_subject - eptr + 1 :
05777         md->offset_vector[offset+1] - md->offset_vector[offset];
05778 
05779       /* Set up for repetition, or handle the non-repeated case */
05780 
05781       switch (*ecode)
05782         {
05783         case OP_CRSTAR:
05784         case OP_CRMINSTAR:
05785         case OP_CRPLUS:
05786         case OP_CRMINPLUS:
05787         case OP_CRQUERY:
05788         case OP_CRMINQUERY:
05789         c = *ecode++ - OP_CRSTAR;
05790         minimize = (c & 1) != 0;
05791         min = rep_min[c];                 /* Pick up values from tables; */
05792         max = rep_max[c];                 /* zero for max => infinity */
05793         if (max == 0) max = INT_MAX;
05794         break;
05795 
05796         case OP_CRRANGE:
05797         case OP_CRMINRANGE:
05798         minimize = (*ecode == OP_CRMINRANGE);
05799         min = GET2(ecode, 1);
05800         max = GET2(ecode, 3);
05801         if (max == 0) max = INT_MAX;
05802         ecode += 5;
05803         break;
05804 
05805         default:               /* No repeat follows */
05806         if (!match_ref(offset, eptr, length, md, ims)) return MATCH_NOMATCH;
05807         eptr += length;
05808         continue;              /* With the main loop */
05809         }
05810 
05811       /* If the length of the reference is zero, just continue with the
05812       main loop. */
05813 
05814       if (length == 0) continue;
05815 
05816       /* First, ensure the minimum number of matches are present. We get back
05817       the length of the reference string explicitly rather than passing the
05818       address of eptr, so that eptr can be a register variable. */
05819 
05820       for (i = 1; i <= min; i++)
05821         {
05822         if (!match_ref(offset, eptr, length, md, ims)) return MATCH_NOMATCH;
05823         eptr += length;
05824         }
05825 
05826       /* If min = max, continue at the same level without recursion.
05827       They are not both allowed to be zero. */
05828 
05829       if (min == max) continue;
05830 
05831       /* If minimizing, keep trying and advancing the pointer */
05832 
05833       if (minimize)
05834         {
05835         for (i = min;; i++)
05836           {
05837           if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
05838                MATCH_NOMATCH) return rrc;
05839           if (i >= max || !match_ref(offset, eptr, length, md, ims))
05840             return MATCH_NOMATCH;
05841           eptr += length;
05842           }
05843         /* Control never gets here */
05844         }
05845 
05846       /* If maximizing, find the longest string and work backwards */
05847 
05848       else
05849         {
05850         const uschar *pp = eptr;
05851         for (i = min; i < max; i++)
05852           {
05853           if (!match_ref(offset, eptr, length, md, ims)) break;
05854           eptr += length;
05855           }
05856         while (eptr >= pp)
05857           {
05858           if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
05859                MATCH_NOMATCH) return rrc;
05860           eptr -= length;
05861           }
05862         return MATCH_NOMATCH;
05863         }
05864       }
05865     /* Control never gets here */
05866 
05867 
05868 
05869     /* Match a bit-mapped character class, possibly repeatedly. This op code is
05870     used when all the characters in the class have values in the range 0-255.
05871     The only difference between OP_CLASS and OP_NCLASS occurs when a data
05872     character outside the range is encountered.
05873 
05874     First, look past the end of the item to see if there is repeat information
05875     following. Then obey similar code to character type repeats - written out
05876     again for speed. */
05877 
05878     case OP_NCLASS:
05879     case OP_CLASS:
05880       {
05881       const uschar *data = ecode + 1;  /* Save for matching */
05882       ecode += 33;                     /* Advance past the item */
05883 
05884       switch (*ecode)
05885         {
05886         case OP_CRSTAR:
05887         case OP_CRMINSTAR:
05888         case OP_CRPLUS:
05889         case OP_CRMINPLUS:
05890         case OP_CRQUERY:
05891         case OP_CRMINQUERY:
05892         c = *ecode++ - OP_CRSTAR;
05893         minimize = (c & 1) != 0;
05894         min = rep_min[c];                 /* Pick up values from tables; */
05895         max = rep_max[c];                 /* zero for max => infinity */
05896         if (max == 0) max = INT_MAX;
05897         break;
05898 
05899         case OP_CRRANGE:
05900         case OP_CRMINRANGE:
05901         minimize = (*ecode == OP_CRMINRANGE);
05902         min = GET2(ecode, 1);
05903         max = GET2(ecode, 3);
05904         if (max == 0) max = INT_MAX;
05905         ecode += 5;
05906         break;
05907 
05908         default:               /* No repeat follows */
05909         min = max = 1;
05910         break;
05911         }
05912 
05913       /* First, ensure the minimum number of matches are present. */
05914 
05915 #ifdef SUPPORT_UTF8
05916       /* UTF-8 mode */
05917       if (md->utf8)
05918         {
05919         for (i = 1; i <= min; i++)
05920           {
05921           if (eptr >= md->end_subject) return MATCH_NOMATCH;
05922           GETCHARINC(c, eptr);
05923           if (c > 255)
05924             {
05925             if (op == OP_CLASS) return MATCH_NOMATCH;
05926             }
05927           else
05928             {
05929             if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
05930             }
05931           }
05932         }
05933       else
05934 #endif
05935       /* Not UTF-8 mode */
05936         {
05937         for (i = 1; i <= min; i++)
05938           {
05939           if (eptr >= md->end_subject) return MATCH_NOMATCH;
05940           c = *eptr++;
05941           if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
05942           }
05943         }
05944 
05945       /* If max == min we can continue with the main loop without the
05946       need to recurse. */
05947 
05948       if (min == max) continue;
05949 
05950       /* If minimizing, keep testing the rest of the expression and advancing
05951       the pointer while it matches the class. */
05952 
05953       if (minimize)
05954         {
05955 #ifdef SUPPORT_UTF8
05956         /* UTF-8 mode */
05957         if (md->utf8)
05958           {
05959           for (i = min;; i++)
05960             {
05961             if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
05962                  MATCH_NOMATCH) return rrc;
05963             if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
05964             GETCHARINC(c, eptr);
05965             if (c > 255)
05966               {
05967               if (op == OP_CLASS) return MATCH_NOMATCH;
05968               }
05969             else
05970               {
05971               if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
05972               }
05973             }
05974           }
05975         else
05976 #endif
05977         /* Not UTF-8 mode */
05978           {
05979           for (i = min;; i++)
05980             {
05981             if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
05982                  MATCH_NOMATCH) return rrc;
05983             if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
05984             c = *eptr++;
05985             if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
05986             }
05987           }
05988         /* Control never gets here */
05989         }
05990 
05991       /* If maximizing, find the longest possible run, then work backwards. */
05992 
05993       else
05994         {
05995         const uschar *pp = eptr;
05996 
05997 #ifdef SUPPORT_UTF8
05998         /* UTF-8 mode */
05999         if (md->utf8)
06000           {
06001           for (i = min; i < max; i++)
06002             {
06003             int len = 1;
06004             if (eptr >= md->end_subject) break;
06005             GETCHARLEN(c, eptr, len);
06006             if (c > 255)
06007               {
06008               if (op == OP_CLASS) break;
06009               }
06010             else
06011               {
06012               if ((data[c/8] & (1 << (c&7))) == 0) break;
06013               }
06014             eptr += len;
06015             }
06016           while (eptr >= pp)
06017             {
06018             if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
06019                  MATCH_NOMATCH) return rrc;
06020             BACKCHAR(eptr)
06021             }
06022           }
06023         else
06024 #endif
06025           /* Not UTF-8 mode */
06026           {
06027           for (i = min; i < max; i++)
06028             {
06029             if (eptr >= md->end_subject) break;
06030             c = *eptr;
06031             if ((data[c/8] & (1 << (c&7))) == 0) break;
06032             eptr++;
06033             }
06034           while (eptr >= pp)
06035             {
06036             if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
06037                  MATCH_NOMATCH) return rrc;
06038             }
06039           }
06040 
06041         return MATCH_NOMATCH;
06042         }
06043       }
06044     /* Control never gets here */
06045 
06046 
06047     /* Match an extended character class. This opcode is encountered only
06048     in UTF-8 mode, because that's the only time it is compiled. */
06049 
06050 #ifdef SUPPORT_UTF8
06051     case OP_XCLASS:
06052       {
06053       const uschar *data = ecode + 1 + LINK_SIZE;  /* Save for matching */
06054       ecode += GET(ecode, 1);                      /* Advance past the item */
06055 
06056       switch (*ecode)
06057         {
06058         case OP_CRSTAR:
06059         case OP_CRMINSTAR:
06060         case OP_CRPLUS:
06061         case OP_CRMINPLUS:
06062         case OP_CRQUERY:
06063         case OP_CRMINQUERY:
06064         c = *ecode++ - OP_CRSTAR;
06065         minimize = (c & 1) != 0;
06066         min = rep_min[c];                 /* Pick up values from tables; */
06067         max = rep_max[c];                 /* zero for max => infinity */
06068         if (max == 0) max = INT_MAX;
06069         break;
06070 
06071         case OP_CRRANGE:
06072         case OP_CRMINRANGE:
06073         minimize = (*ecode == OP_CRMINRANGE);
06074         min = GET2(ecode, 1);
06075         max = GET2(ecode, 3);
06076         if (max == 0) max = INT_MAX;
06077         ecode += 5;
06078         break;
06079 
06080         default:               /* No repeat follows */
06081         min = max = 1;
06082         break;
06083         }
06084 
06085       /* First, ensure the minimum number of matches are present. */
06086 
06087       for (i = 1; i <= min; i++)
06088         {
06089         if (eptr >= md->end_subject) return MATCH_NOMATCH;
06090         GETCHARINC(c, eptr);
06091         if (!match_xclass(c, data)) return MATCH_NOMATCH;
06092         }
06093 
06094       /* If max == min we can continue with the main loop without the
06095       need to recurse. */
06096 
06097       if (min == max) continue;
06098 
06099       /* If minimizing, keep testing the rest of the expression and advancing
06100       the pointer while it matches the class. */
06101 
06102       if (minimize)
06103         {
06104         for (i = min;; i++)
06105           {
06106           if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
06107                MATCH_NOMATCH) return rrc;
06108           if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
06109           GETCHARINC(c, eptr);
06110           if (!match_xclass(c, data)) return MATCH_NOMATCH;
06111           }
06112         /* Control never gets here */
06113         }
06114 
06115       /* If maximizing, find the longest possible run, then work backwards. */
06116 
06117       else
06118         {
06119         const uschar *pp = eptr;
06120         for (i = min; i < max; i++)
06121           {
06122           int len = 1;
06123           if (eptr >= md->end_subject) break;
06124           GETCHARLEN(c, eptr, len);
06125           if (!match_xclass(c, data)) break;
06126           eptr += len;
06127           }
06128         while (eptr >= pp)
06129           {
06130           if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
06131                MATCH_NOMATCH) return rrc;
06132           BACKCHAR(eptr)
06133           }
06134         return MATCH_NOMATCH;
06135         }
06136 
06137       /* Control never gets here */
06138       }
06139 #endif    /* End of XCLASS */
06140 
06141     /* Match a run of characters */
06142 
06143     case OP_CHARS:
06144       {
06145       register int length = ecode[1];
06146       ecode += 2;
06147 
06148 #ifdef DEBUG    /* Sigh. Some compilers never learn. */
06149       if (eptr >= md->end_subject)
06150         printf("matching subject <null> against pattern ");
06151       else
06152         {
06153         printf("matching subject ");
06154         pchars(eptr, length, TRUE, md);
06155         printf(" against pattern ");
06156         }
06157       pchars(ecode, length, FALSE, md);
06158       printf("\n");
06159 #endif
06160 
06161       if (length > md->end_subject - eptr) return MATCH_NOMATCH;
06162       if ((ims & PCRE_CASELESS) != 0)
06163         {
06164         while (length-- > 0)
06165           if (md->lcc[*ecode++] != md->lcc[*eptr++])
06166             return MATCH_NOMATCH;
06167         }
06168       else
06169         {
06170         while (length-- > 0) if (*ecode++ != *eptr++) return MATCH_NOMATCH;
06171         }
06172       }
06173     break;
06174 
06175     /* Match a single character repeatedly; different opcodes share code. */
06176 
06177     case OP_EXACT:
06178     min = max = GET2(ecode, 1);
06179     ecode += 3;
06180     goto REPEATCHAR;
06181 
06182     case OP_UPTO:
06183     case OP_MINUPTO:
06184     min = 0;
06185     max = GET2(ecode, 1);
06186     minimize = *ecode == OP_MINUPTO;
06187     ecode += 3;
06188     goto REPEATCHAR;
06189 
06190     case OP_STAR:
06191     case OP_MINSTAR:
06192     case OP_PLUS:
06193     case OP_MINPLUS:
06194     case OP_QUERY:
06195     case OP_MINQUERY:
06196     c = *ecode++ - OP_STAR;
06197     minimize = (c & 1) != 0;
06198     min = rep_min[c];                 /* Pick up values from tables; */
06199     max = rep_max[c];                 /* zero for max => infinity */
06200     if (max == 0) max = INT_MAX;
06201 
06202     /* Common code for all repeated single-character matches. We can give
06203     up quickly if there are fewer than the minimum number of characters left in
06204     the subject. */
06205 
06206     REPEATCHAR:
06207 #ifdef SUPPORT_UTF8
06208     if (md->utf8)
06209       {
06210       int len = 1;
06211       const uschar *charptr = ecode;
06212       GETCHARLEN(c, ecode, len);
06213       if (min * len > md->end_subject - eptr) return MATCH_NOMATCH;
06214       ecode += len;
06215 
06216       /* Handle multibyte character matching specially here. There is no
06217       support for any kind of casing for multibyte characters. */
06218 
06219       if (len > 1)
06220         {
06221         for (i = 1; i <= min; i++)
06222           {
06223           if (memcmp(eptr, charptr, len) != 0) return MATCH_NOMATCH;
06224           eptr += len;
06225           }
06226 
06227         if (min == max) continue;
06228 
06229         if (minimize)
06230           {
06231           for (i = min;; i++)
06232             {
06233             if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
06234                  MATCH_NOMATCH) return rrc;
06235             if (i >= max ||
06236                 eptr >= md->end_subject ||
06237                 memcmp(eptr, charptr, len) != 0)
06238               return MATCH_NOMATCH;
06239             eptr += len;
06240             }
06241           /* Control never gets here */
06242           }
06243         else
06244           {
06245           const uschar *pp = eptr;
06246           for (i = min; i < max; i++)
06247             {
06248             if (eptr > md->end_subject - len ||
06249                 memcmp(eptr, charptr, len) != 0)
06250               break;
06251             eptr += len;
06252             }
06253           while (eptr >= pp)
06254            {
06255            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
06256                 MATCH_NOMATCH) return rrc;
06257            eptr -= len;
06258            }
06259           return MATCH_NOMATCH;
06260           }
06261         /* Control never gets here */
06262         }
06263 
06264       /* If the length of a UTF-8 character is 1, we fall through here, and
06265       obey the code as for non-UTF-8 characters below, though in this case the
06266       value of c will always be < 128. */
06267       }
06268     else
06269 #endif
06270 
06271     /* When not in UTF-8 mode, load a single-byte character. */
06272       {
06273       if (min > md->end_subject - eptr) return MATCH_NOMATCH;
06274       c = *ecode++;
06275       }
06276 
06277     /* The value of c at this point is always less than 256, though we may or
06278     may not be in UTF-8 mode. The code is duplicated for the caseless and
06279     caseful cases, for speed, since matching characters is likely to be quite
06280     common. First, ensure the minimum number of matches are present. If min =
06281     max, continue at the same level without recursing. Otherwise, if
06282     minimizing, keep trying the rest of the expression and advancing one
06283     matching character if failing, up to the maximum. Alternatively, if
06284     maximizing, find the maximum number of characters and work backwards. */
06285 
06286     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
06287       max, eptr));
06288 
06289     if ((ims & PCRE_CASELESS) != 0)
06290       {
06291       c = md->lcc[c];
06292       for (i = 1; i <= min; i++)
06293         if (c != md->lcc[*eptr++]) return MATCH_NOMATCH;
06294       if (min == max) continue;
06295       if (minimize)
06296         {
06297         for (i = min;; i++)
06298           {
06299           if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
06300                MATCH_NOMATCH) return rrc;
06301           if (i >= max || eptr >= md->end_subject ||
06302               c != md->lcc[*eptr++])
06303             return MATCH_NOMATCH;
06304           }
06305         /* Control never gets here */
06306         }
06307       else
06308         {
06309         const uschar *pp = eptr;
06310         for (i = min; i < max; i++)
06311           {
06312           if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
06313           eptr++;
06314           }
06315         while (eptr >= pp)
06316           if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
06317                MATCH_NOMATCH) return rrc;
06318         return MATCH_NOMATCH;
06319         }
06320       /* Control never gets here */
06321       }
06322 
06323     /* Caseful comparisons (single-byte only; multibyte is handled above) */
06324 
06325     else
06326       {
06327       for (i = 1; i <= min; i++) if (c != *eptr++) return MATCH_NOMATCH;
06328       if (min == max) continue;
06329       if (minimize)
06330         {
06331         for (i = min;; i++)
06332           {
06333           if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
06334                MATCH_NOMATCH) return rrc;
06335           if (i >= max || eptr >= md->end_subject || c != *eptr++)
06336             return MATCH_NOMATCH;
06337           }
06338         /* Control never gets here */
06339         }
06340       else
06341         {
06342         const uschar *pp = eptr;
06343         for (i = min; i < max; i++)
06344           {
06345           if (eptr >= md->end_subject || c != *eptr) break;
06346           eptr++;
06347           }
06348         while (eptr >= pp)
06349          if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
06350               MATCH_NOMATCH) return rrc;
06351         return MATCH_NOMATCH;
06352         }
06353       }
06354     /* Control never gets here */
06355 
06356     /* Match a negated single one-byte character. The character we are
06357     checking can be multibyte. */
06358 
06359     case OP_NOT:
06360     if (eptr >= md->end_subject) return MATCH_NOMATCH;
06361     ecode++;
06362     GETCHARINCTEST(c, eptr);
06363     if ((ims & PCRE_CASELESS) != 0)
06364       {
06365 #ifdef SUPPORT_UTF8
06366       if (c < 256)
06367 #endif
06368       c = md->lcc[c];
06369       if (md->lcc[*ecode++] == c) return MATCH_NOMATCH;
06370       }
06371     else
06372       {
06373       if (*ecode++ == c) return MATCH_NOMATCH;
06374       }
06375     break;
06376 
06377     /* Match a negated single one-byte character repeatedly. This is almost a
06378     repeat of the code for a repeated single character, but I haven't found a
06379     nice way of commoning these up that doesn't require a test of the
06380     positive/negative option for each character match. Maybe that wouldn't add
06381     very much to the time taken, but character matching *is* what this is all
06382     about... */
06383 
06384     case OP_NOTEXACT:
06385     min = max = GET2(ecode, 1);
06386     ecode += 3;
06387     goto REPEATNOTCHAR;
06388 
06389     case OP_NOTUPTO:
06390     case OP_NOTMINUPTO:
06391     min = 0;
06392     max = GET2(ecode, 1);
06393     minimize = *ecode == OP_NOTMINUPTO;
06394     ecode += 3;
06395     goto REPEATNOTCHAR;
06396 
06397     case OP_NOTSTAR:
06398     case OP_NOTMINSTAR:
06399     case OP_NOTPLUS:
06400     case OP_NOTMINPLUS:
06401     case OP_NOTQUERY:
06402     case OP_NOTMINQUERY:
06403     c = *ecode++ - OP_NOTSTAR;
06404     minimize = (c & 1) != 0;
06405     min = rep_min[c];                 /* Pick up values from tables; */
06406     max = rep_max[c];                 /* zero for max => infinity */
06407     if (max == 0) max = INT_MAX;
06408 
06409     /* Common code for all repeated negated single-character matches (the
06410     negated character is one byte, so less than 256). We can give up quickly
06411     if fewer than the minimum number of bytes are left in the subject. */
06412 
06413     REPEATNOTCHAR:
06414     if (min > md->end_subject - eptr) return MATCH_NOMATCH;
06415     c = *ecode++;
06416 
06417     /* The code is duplicated for the caseless and caseful cases, for speed,
06418     since matching characters is likely to be quite common. First, ensure the
06419     minimum number of matches are present. If min = max, continue at the same
06420     level without recursing. Otherwise, if minimizing, keep trying the rest of
06421     the expression and advancing one matching character if failing, up to the
06422     maximum. Alternatively, if maximizing, find the maximum number of
06423     characters and work backwards. */
06424 
06425     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
06426       max, eptr));
06427 
06428     if ((ims & PCRE_CASELESS) != 0)
06429       {
06430       c = md->lcc[c];
06431 
06432 #ifdef SUPPORT_UTF8
06433       /* UTF-8 mode */
06434       if (md->utf8)
06435         {
06436         register int d;
06437         for (i = 1; i <= min; i++)
06438           {
06439           GETCHARINC(d, eptr);
06440           if (d < 256) d = md->lcc[d];
06441           if (c == d) return MATCH_NOMATCH;
06442           }
06443         }
06444       else
06445 #endif
06446 
06447       /* Not UTF-8 mode */
06448         {
06449         for (i = 1; i <= min; i++)
06450           if (c == md->lcc[*eptr++]) return MATCH_NOMATCH;
06451         }
06452 
06453       if (min == max) continue;
06454 
06455       if (minimize)
06456         {
06457 #ifdef SUPPORT_UTF8
06458         /* UTF-8 mode */
06459         if (md->utf8)
06460           {
06461           register int d;
06462           for (i = min;; i++)
06463             {
06464             if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
06465                  MATCH_NOMATCH) return rrc;
06466             GETCHARINC(d, eptr);
06467             if (d < 256) d = md->lcc[d];
06468             if (i >= max || eptr >= md->end_subject || c == d)
06469               return MATCH_NOMATCH;
06470             }
06471           }
06472         else
06473 #endif
06474         /* Not UTF-8 mode */
06475           {
06476           for (i = min;; i++)
06477             {
06478             if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
06479                  MATCH_NOMATCH) return rrc;
06480             if (i >= max || eptr >= md->end_subject || c == md->lcc[*eptr++])
06481               return MATCH_NOMATCH;
06482             }
06483           }
06484         /* Control never gets here */
06485         }
06486 
06487       /* Maximize case */
06488 
06489       else
06490         {
06491         const uschar *pp = eptr;
06492 
06493 #ifdef SUPPORT_UTF8
06494         /* UTF-8 mode */
06495         if (md->utf8)
06496           {
06497           register int d;
06498           for (i = min; i < max; i++)
06499             {
06500             int len = 1;
06501             if (eptr >= md->end_subject) break;
06502             GETCHARLEN(d, eptr, len);
06503             if (d < 256) d = md->lcc[d];
06504             if (c == d) break;
06505             eptr += len;
06506             }
06507           while (eptr >= pp)
06508             {
06509             if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
06510                  MATCH_NOMATCH) return rrc;
06511             eptr--;
06512             BACKCHAR(eptr);
06513             }
06514           }
06515         else
06516 #endif
06517         /* Not UTF-8 mode */
06518           {
06519           for (i = min; i < max; i++)
06520             {
06521             if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
06522             eptr++;
06523             }
06524           while (eptr >= pp)
06525             {
06526             if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
06527                  MATCH_NOMATCH) return rrc;
06528             eptr--;
06529             }
06530           }
06531 
06532         return MATCH_NOMATCH;
06533         }
06534       /* Control never gets here */
06535       }
06536 
06537     /* Caseful comparisons */
06538 
06539     else
06540       {
06541 #ifdef SUPPORT_UTF8
06542       /* UTF-8 mode */
06543       if (md->utf8)
06544         {
06545         register int d;
06546         for (i = 1; i <= min; i++)
06547           {
06548           GETCHARINC(d, eptr);
06549           if (c == d) return MATCH_NOMATCH;
06550           }
06551         }
06552       else
06553 #endif
06554       /* Not UTF-8 mode */
06555         {
06556         for (i = 1; i <= min; i++)
06557           if (c == *eptr++) return MATCH_NOMATCH;
06558         }
06559 
06560       if (min == max) continue;
06561 
06562       if (minimize)
06563         {
06564 #ifdef SUPPORT_UTF8
06565         /* UTF-8 mode */
06566         if (md->utf8)
06567           {
06568           register int d;
06569           for (i = min;; i++)
06570             {
06571             if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
06572                  MATCH_NOMATCH) return rrc;
06573             GETCHARINC(d, eptr);
06574             if (i >= max || eptr >= md->end_subject || c == d)
06575               return MATCH_NOMATCH;
06576             }
06577           }
06578         else
06579 #endif
06580         /* Not UTF-8 mode */
06581           {
06582           for (i = min;; i++)
06583             {
06584             if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
06585                  MATCH_NOMATCH) return rrc;
06586             if (i >= max || eptr >= md->end_subject || c == *eptr++)
06587               return MATCH_NOMATCH;
06588             }
06589           }
06590         /* Control never gets here */
06591         }
06592 
06593       /* Maximize case */
06594 
06595       else
06596         {
06597         const uschar *pp = eptr;
06598 
06599 #ifdef SUPPORT_UTF8
06600         /* UTF-8 mode */
06601         if (md->utf8)
06602           {
06603           register int d;
06604           for (i = min; i < max; i++)
06605             {
06606             int len = 1;
06607             if (eptr >= md->end_subject) break;
06608             GETCHARLEN(d, eptr, len);
06609             if (c == d) break;
06610             eptr += len;
06611             }
06612           while (eptr >= pp)
06613             {
06614             if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
06615                 MATCH_NOMATCH) return rrc;
06616             eptr--;
06617             BACKCHAR(eptr);
06618             }
06619           }
06620         else
06621 #endif
06622         /* Not UTF-8 mode */
06623           {
06624           for (i = min; i < max; i++)
06625             {
06626             if (eptr >= md->end_subject || c == *eptr) break;
06627             eptr++;
06628             }
06629           while (eptr >= pp)
06630             {
06631             if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
06632                 MATCH_NOMATCH) return rrc;
06633             eptr--;
06634             }
06635           }
06636 
06637         return MATCH_NOMATCH;
06638         }
06639       }
06640     /* Control never gets here */
06641 
06642     /* Match a single character type repeatedly; several different opcodes
06643     share code. This is very similar to the code for single characters, but we
06644     repeat it in the interests of efficiency. */
06645 
06646     case OP_TYPEEXACT:
06647     min = max = GET2(ecode, 1);
06648     minimize = TRUE;
06649     ecode += 3;
06650     goto REPEATTYPE;
06651 
06652     case OP_TYPEUPTO:
06653     case OP_TYPEMINUPTO:
06654     min = 0;
06655     max = GET2(ecode, 1);
06656     minimize = *ecode == OP_TYPEMINUPTO;
06657     ecode += 3;
06658     goto REPEATTYPE;
06659 
06660     case OP_TYPESTAR:
06661     case OP_TYPEMINSTAR:
06662     case OP_TYPEPLUS:
06663     case OP_TYPEMINPLUS:
06664     case OP_TYPEQUERY:
06665     case OP_TYPEMINQUERY:
06666     c = *ecode++ - OP_TYPESTAR;
06667     minimize = (c & 1) != 0;
06668     min = rep_min[c];                 /* Pick up values from tables; */
06669     max = rep_max[c];                 /* zero for max => infinity */
06670     if (max == 0) max = INT_MAX;
06671 
06672     /* Common code for all repeated single character type matches. Note that
06673     in UTF-8 mode, '.' matches a character of any length, but for the other
06674     character types, the valid characters are all one-byte long. */
06675 
06676     REPEATTYPE:
06677     ctype = *ecode++;      /* Code for the character type */
06678 
06679     /* First, ensure the minimum number of matches are present. Use inline
06680     code for maximizing the speed, and do the type test once at the start
06681     (i.e. keep it out of the loop). Also we can test that there are at least
06682     the minimum number of bytes before we start. This isn't as effective in
06683     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
06684     is tidier. */
06685 
06686     if (min > md->end_subject - eptr) return MATCH_NOMATCH;
06687     if (min > 0)
06688       {
06689 #ifdef SUPPORT_UTF8
06690       if (md->utf8) switch(ctype)
06691         {
06692         case OP_ANY:
06693         for (i = 1; i <= min; i++)
06694           {
06695           if (eptr >= md->end_subject ||
06696              (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0)) /*** FIXME: test LF too? ***/
06697             return MATCH_NOMATCH;
06698           while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
06699           }
06700         break;
06701 
06702         case OP_ANYBYTE:
06703         eptr += min;
06704         break;
06705 
06706         case OP_NOT_DIGIT:
06707         for (i = 1; i <= min; i++)
06708           {
06709           if (eptr >= md->end_subject) return MATCH_NOMATCH;
06710           GETCHARINC(c, eptr);
06711           if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
06712             return MATCH_NOMATCH;
06713           }
06714         break;
06715 
06716         case OP_DIGIT:
06717         for (i = 1; i <= min; i++)
06718           {
06719           if (eptr >= md->end_subject ||
06720              *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
06721             return MATCH_NOMATCH;
06722           /* No need to skip more bytes - we know it's a 1-byte character */
06723           }
06724         break;
06725 
06726         case OP_NOT_WHITESPACE:
06727         for (i = 1; i <= min; i++)
06728           {
06729           if (eptr >= md->end_subject ||
06730              (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
06731             return MATCH_NOMATCH;
06732           while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
06733           }
06734         break;
06735 
06736         case OP_WHITESPACE:
06737         for (i = 1; i <= min; i++)
06738           {
06739           if (eptr >= md->end_subject ||
06740              *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
06741             return MATCH_NOMATCH;
06742           /* No need to skip more bytes - we know it's a 1-byte character */
06743           }
06744         break;
06745 
06746         case OP_NOT_WORDCHAR:
06747         for (i = 1; i <= min; i++)
06748           {
06749           if (eptr >= md->end_subject ||
06750              (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
06751             return MATCH_NOMATCH;
06752           while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
06753           }
06754         break;
06755 
06756         case OP_WORDCHAR:
06757         for (i = 1; i <= min; i++)
06758           {
06759           if (eptr >= md->end_subject ||
06760              *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
06761             return MATCH_NOMATCH;
06762           /* No need to skip more bytes - we know it's a 1-byte character */
06763           }
06764         break;
06765         }
06766       else
06767 #endif
06768 
06769       /* Code for the non-UTF-8 case for minimum matching */
06770 
06771       switch(ctype)
06772         {
06773         case OP_ANY:
06774         if ((ims & PCRE_DOTALL) == 0)
06775           {
06776           for (i = 1; i <= min; i++)
06777             if (*eptr++ == NEWLINE) return MATCH_NOMATCH; /*** FIXME: test LF too? ***/
06778           }
06779         else eptr += min;
06780         break;
06781 
06782         case OP_ANYBYTE:
06783         eptr += min;
06784         break;
06785 
06786         case OP_NOT_DIGIT:
06787         for (i = 1; i <= min; i++)
06788           if ((md->ctypes[*eptr++] & ctype_digit) != 0) return MATCH_NOMATCH;
06789         break;
06790 
06791         case OP_DIGIT:
06792         for (i = 1; i <= min; i++)
06793           if ((md->ctypes[*eptr++] & ctype_digit) == 0) return MATCH_NOMATCH;
06794         break;
06795 
06796         case OP_NOT_WHITESPACE:
06797         for (i = 1; i <= min; i++)
06798           if ((md->ctypes[*eptr++] & ctype_space) != 0) return MATCH_NOMATCH;
06799         break;
06800 
06801         case OP_WHITESPACE:
06802         for (i = 1; i <= min; i++)
06803           if ((md->ctypes[*eptr++] & ctype_space) == 0) return MATCH_NOMATCH;
06804         break;
06805 
06806         case OP_NOT_WORDCHAR:
06807         for (i = 1; i <= min; i++)
06808           if ((md->ctypes[*eptr++] & ctype_word) != 0)
06809             return MATCH_NOMATCH;
06810         break;
06811 
06812         case OP_WORDCHAR:
06813         for (i = 1; i <= min; i++)
06814           if ((md->ctypes[*eptr++] & ctype_word) == 0)
06815             return MATCH_NOMATCH;
06816         break;
06817         }
06818       }
06819 
06820     /* If min = max, continue at the same level without recursing */
06821 
06822     if (min == max) continue;
06823 
06824     /* If minimizing, we have to test the rest of the pattern before each
06825     subsequent match. Again, separate the UTF-8 case for speed. */
06826 
06827     if (minimize)
06828       {
06829 #ifdef SUPPORT_UTF8
06830       /* UTF-8 mode */
06831       if (md->utf8)
06832         {
06833         for (i = min;; i++)
06834           {
06835           if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
06836                MATCH_NOMATCH) return rrc;
06837           if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
06838 
06839           GETCHARINC(c, eptr);
06840           switch(ctype)
06841             {
06842             case OP_ANY:
06843             if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return MATCH_NOMATCH; /*** FIXME: test LF too? ***/
06844             break;
06845 
06846             case OP_ANYBYTE:
06847             break;
06848 
06849             case OP_NOT_DIGIT:
06850             if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
06851               return MATCH_NOMATCH;
06852             break;
06853 
06854             case OP_DIGIT:
06855             if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
06856               return MATCH_NOMATCH;
06857             break;
06858 
06859             case OP_NOT_WHITESPACE:
06860             if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
06861               return MATCH_NOMATCH;
06862             break;
06863 
06864             case OP_WHITESPACE:
06865             if  (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
06866               return MATCH_NOMATCH;
06867             break;
06868 
06869             case OP_NOT_WORDCHAR:
06870             if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
06871               return MATCH_NOMATCH;
06872             break;
06873 
06874             case OP_WORDCHAR:
06875             if (c >= 256 && (md->ctypes[c] & ctype_word) == 0)
06876               return MATCH_NOMATCH;
06877             break;
06878             }
06879           }
06880         }
06881       else
06882 #endif
06883       /* Not UTF-8 mode */
06884         {
06885         for (i = min;; i++)
06886           {
06887           if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
06888                MATCH_NOMATCH) return rrc;
06889           if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
06890           c = *eptr++;
06891           switch(ctype)
06892             {
06893             case OP_ANY:
06894             if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return MATCH_NOMATCH; /*** FIXME: test LF too? ***/
06895             break;
06896 
06897             case OP_ANYBYTE:
06898             break;
06899 
06900             case OP_NOT_DIGIT:
06901             if ((md->ctypes[c] & ctype_digit) != 0) return MATCH_NOMATCH;
06902             break;
06903 
06904             case OP_DIGIT:
06905             if ((md->ctypes[c] & ctype_digit) == 0) return MATCH_NOMATCH;
06906             break;
06907 
06908             case OP_NOT_WHITESPACE:
06909             if ((md->ctypes[c] & ctype_space) != 0) return MATCH_NOMATCH;
06910             break;
06911 
06912             case OP_WHITESPACE:
06913             if  ((md->ctypes[c] & ctype_space) == 0) return MATCH_NOMATCH;
06914             break;
06915 
06916             case OP_NOT_WORDCHAR:
06917             if ((md->ctypes[c] & ctype_word) != 0) return MATCH_NOMATCH;
06918             break;
06919 
06920             case OP_WORDCHAR:
06921             if ((md->ctypes[c] & ctype_word) == 0) return MATCH_NOMATCH;
06922             break;
06923             }
06924           }
06925         }
06926       /* Control never gets here */
06927       }
06928 
06929     /* If maximizing it is worth using inline code for speed, doing the type
06930     test once at the start (i.e. keep it out of the loop). Again, keep the
06931     UTF-8 stuff separate. */
06932 
06933     else
06934       {
06935       const uschar *pp = eptr;
06936 
06937 #ifdef SUPPORT_UTF8
06938       /* UTF-8 mode */
06939 
06940       if (md->utf8)
06941         {
06942         switch(ctype)
06943           {
06944           case OP_ANY:
06945 
06946           /* Special code is required for UTF8, but when the maximum is unlimited
06947           we don't need it, so we repeat the non-UTF8 code. This is probably
06948           worth it, because .* is quite a common idiom. */
06949 
06950           if (max < INT_MAX)
06951             {
06952             if ((ims & PCRE_DOTALL) == 0)
06953               {
06954               for (i = min; i < max; i++)
06955                 {
06956                 if (eptr >= md->end_subject || *eptr == NEWLINE) break; /*** FIXME: test LF too? ***/
06957                 eptr++;
06958                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
06959                 }
06960               }
06961             else
06962               {
06963               for (i = min; i < max; i++)
06964                 {
06965                 eptr++;
06966                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
06967                 }
06968               }
06969             }
06970 
06971           /* Handle unlimited UTF-8 repeat */
06972 
06973           else
06974             {
06975             if ((ims & PCRE_DOTALL) == 0)
06976               {
06977               for (i = min; i < max; i++)
06978                 {
06979                 if (eptr >= md->end_subject || *eptr == NEWLINE) break; /*** FIXME: test LF too? ***/
06980                 eptr++;
06981                 }
06982               break;
06983               }
06984             else
06985               {
06986               c = max - min;
06987               if (c > md->end_subject - eptr) c = md->end_subject - eptr;
06988               eptr += c;
06989               }
06990             }
06991           break;
06992 
06993           /* The byte case is the same as non-UTF8 */
06994 
06995           case OP_ANYBYTE:
06996           c = max - min;
06997           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
06998           eptr += c;
06999           break;
07000 
07001           case OP_NOT_DIGIT:
07002           for (i = min; i < max; i++)
07003             {
07004             int len = 1;
07005             if (eptr >= md->end_subject) break;
07006             GETCHARLEN(c, eptr, len);
07007             if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
07008             eptr+= len;
07009             }
07010           break;
07011 
07012           case OP_DIGIT:
07013           for (i = min; i < max; i++)
07014             {
07015             int len = 1;
07016             if (eptr >= md->end_subject) break;
07017             GETCHARLEN(c, eptr, len);
07018             if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
07019             eptr+= len;
07020             }
07021           break;
07022 
07023           case OP_NOT_WHITESPACE:
07024           for (i = min; i < max; i++)
07025             {
07026             int len = 1;
07027             if (eptr >= md->end_subject) break;
07028             GETCHARLEN(c, eptr, len);
07029             if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
07030             eptr+= len;
07031             }
07032           break;
07033 
07034           case OP_WHITESPACE:
07035           for (i = min; i < max; i++)
07036             {
07037             int len = 1;
07038             if (eptr >= md->end_subject) break;
07039             GETCHARLEN(c, eptr, len);
07040             if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
07041             eptr+= len;
07042             }
07043           break;
07044 
07045           case OP_NOT_WORDCHAR:
07046           for (i = min; i < max; i++)
07047             {
07048             int len = 1;
07049             if (eptr >= md->end_subject) break;
07050             GETCHARLEN(c, eptr, len);
07051             if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
07052             eptr+= len;
07053             }
07054           break;
07055 
07056           case OP_WORDCHAR:
07057           for (i = min; i < max; i++)
07058             {
07059             int len = 1;
07060             if (eptr >= md->end_subject) break;
07061             GETCHARLEN(c, eptr, len);
07062             if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
07063             eptr+= len;
07064             }
07065           break;
07066           }
07067 
07068         /* eptr is now past the end of the maximum run */
07069 
07070         while (eptr >= pp)
07071           {
07072           if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
07073                MATCH_NOMATCH) return rrc;
07074           BACKCHAR(eptr);
07075           }
07076         }
07077       else
07078 #endif
07079 
07080       /* Not UTF-8 mode */
07081         {
07082         switch(ctype)
07083           {
07084           case OP_ANY:
07085           if ((ims & PCRE_DOTALL) == 0)
07086             {
07087             for (i = min; i < max; i++)
07088               {
07089               if (eptr >= md->end_subject || *eptr == NEWLINE) break; /*** FIXME: test LF too? ***/
07090               eptr++;
07091               }
07092             break;
07093             }
07094           /* For DOTALL case, fall through and treat as \C */
07095 
07096           case OP_ANYBYTE:
07097           c = max - min;
07098           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
07099           eptr += c;
07100           break;
07101 
07102           case OP_NOT_DIGIT:
07103           for (i = min; i < max; i++)
07104             {
07105             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
07106               break;
07107             eptr++;
07108             }
07109           break;
07110 
07111           case OP_DIGIT:
07112           for (i = min; i < max; i++)
07113             {
07114             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
07115               break;
07116             eptr++;
07117             }
07118           break;
07119 
07120           case OP_NOT_WHITESPACE:
07121           for (i = min; i < max; i++)
07122             {
07123             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
07124               break;
07125             eptr++;
07126             }
07127           break;
07128 
07129           case OP_WHITESPACE:
07130           for (i = min; i < max; i++)
07131             {
07132             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
07133               break;
07134             eptr++;
07135             }
07136           break;
07137 
07138           case OP_NOT_WORDCHAR:
07139           for (i = min; i < max; i++)
07140             {
07141             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
07142               break;
07143             eptr++;
07144             }
07145           break;
07146 
07147           case OP_WORDCHAR:
07148           for (i = min; i < max; i++)
07149             {
07150             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
07151               break;
07152             eptr++;
07153             }
07154           break;
07155           }
07156 
07157         /* eptr is now past the end of the maximum run */
07158 
07159         while (eptr >= pp)
07160           {
07161           if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
07162                MATCH_NOMATCH) return rrc;
07163           }
07164         }
07165 
07166       /* Get here if we can't make it match with any permitted repetitions */
07167 
07168       return MATCH_NOMATCH;
07169       }
07170     /* Control never gets here */
07171 
07172     /* There's been some horrible disaster. Since all codes > OP_BRA are
07173     for capturing brackets, and there shouldn't be any gaps between 0 and
07174     OP_BRA, arrival here can only mean there is something seriously wrong
07175     in the code above or the OP_xxx definitions. */
07176 
07177     default:
07178     DPRINTF(("Unknown opcode %d\n", *ecode));
07179     return PCRE_ERROR_UNKNOWN_NODE;
07180     }
07181 
07182   /* Do not stick any code in here without much thought; it is assumed
07183   that "continue" in the code above comes out to here to repeat the main
07184   loop. */
07185 
07186   }             /* End of main loop */
07187 /* Control never reaches here */
07188 }
07189 
07190 
07191 
07192 
07193 /*************************************************
07194 *         Execute a Regular Expression           *
07195 *************************************************/
07196 
07197 /* This function applies a compiled re to a subject string and picks out
07198 portions of the string if it matches. Two elements in the vector are set for
07199 each substring: the offsets to the start and end of the substring.
07200 
07201 Arguments:
07202   external_re     points to the compiled expression
07203   extra_data      points to extra data or is NULL
07204   subject         points to the subject string
07205   length          length of subject string (may contain binary zeros)
07206   start_offset    where to start in the subject string
07207   match_length    range of chars where we will attempt to start a match [2003-04-29 AR]
07208   options         option bits
07209   offsets         points to a vector of ints to be filled in with offsets
07210   offsetcount     the number of elements in the vector
07211 
07212 Returns:          > 0 => success; value is the number of elements filled in
07213                   = 0 => success, but offsets is not big enough
07214                    -1 => failed to match
07215                  < -1 => some kind of unexpected problem
07216 
07217 2003-04-29 AR: Added match_length parameter to allow the caller to limit
07218 the part of the subject string where we will try to start a match.
07219 
07220 2003-05-04 AR: Added tables parameter to allow compiled patterns to be
07221 relocated in memory space. We no longer look at the tables field of the
07222 real_pcre struct.
07223 */
07224 
07225 int
07226 pcre_exec(const pcre *external_re, const pcre_extra *extra_data,
07227   const char *subject, int length, int start_offset, int match_length,
07228   int options, int *offsets, int offsetcount, const unsigned char *tables)
07229 {
07230 int rc, resetcount, ocount;
07231 int first_byte = -1;
07232 int req_byte = -1;
07233 int req_byte2 = -1;
07234 unsigned long int ims = 0;
07235 BOOL using_temporary_offsets = FALSE;
07236 BOOL anchored;
07237 BOOL startline;
07238 BOOL first_byte_caseless = FALSE;
07239 BOOL req_byte_caseless = FALSE;
07240 match_data match_block;
07241 const uschar *start_bits = NULL;
07242 const uschar *start_match = (const uschar *)subject + start_offset;
07243 const uschar *end_subject;
07244 const uschar *stop_match;  /*2003-04-29 AR*/
07245 const uschar *req_byte_ptr = start_match - 1;
07246 const pcre_study_data *study;
07247 const real_pcre *re = (const real_pcre *)external_re;
07248 
07249 /* Plausibility checks */
07250 
07251 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
07252 if (re == NULL || subject == NULL ||
07253    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
07254 
07255 /* Fish out the optional data from the extra_data structure, first setting
07256 the default values. */
07257 
07258 study = NULL;
07259 match_block.match_limit = MATCH_LIMIT;
07260 match_block.callout_data = NULL;
07261 
07262 if (extra_data != NULL)
07263   {
07264   register unsigned int flags = extra_data->flags;
07265   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
07266     study = extra_data->study_data;
07267   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
07268     match_block.match_limit = extra_data->match_limit;
07269   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
07270     match_block.callout_data = extra_data->callout_data;
07271   }
07272 
07273 /* Now we have re supposedly pointing to the regex */
07274 
07275 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
07276 
07277 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
07278 startline = (re->options & PCRE_STARTLINE) != 0;
07279 
07280 match_block.start_code =
07281   (const uschar *)re + sizeof(real_pcre) + re->name_count * re->name_entry_size;
07282 match_block.start_subject = (const uschar *)subject;
07283 match_block.start_offset = start_offset;
07284 match_block.end_subject = match_block.start_subject + length;
07285 end_subject = match_block.end_subject;
07286 stop_match = match_block.start_subject + start_offset + match_length;
07287 
07288 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
07289 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
07290 
07291 match_block.notbol = (options & PCRE_NOTBOL) != 0;
07292 match_block.noteol = (options & PCRE_NOTEOL) != 0;
07293 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
07294 
07295 match_block.recursive = NULL;                   /* No recursion at top level */
07296 
07297 match_block.lcc = tables + lcc_offset;          /*2003-05-04 AR*/
07298 match_block.ctypes = tables + ctypes_offset;    /*2003-05-04 AR*/
07299 
07300 /* The ims options can vary during the matching as a result of the presence
07301 of (?ims) items in the pattern. They are kept in a local variable so that
07302 restoring at the exit of a group is easy. */
07303 
07304 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
07305 
07306 /* If the expression has got more back references than the offsets supplied can
07307 hold, we get a temporary bit of working store to use during the matching.
07308 Otherwise, we can use the vector supplied, rounding down its size to a multiple
07309 of 3. */
07310 
07311 ocount = offsetcount - (offsetcount % 3);
07312 
07313 if (re->top_backref > 0 && re->top_backref >= ocount/3)
07314   {
07315   ocount = re->top_backref * 3 + 3;
07316   match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
07317   if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
07318   using_temporary_offsets = TRUE;
07319   DPRINTF(("Got memory to hold back references\n"));
07320   }
07321 else match_block.offset_vector = offsets;
07322 
07323 match_block.offset_end = ocount;
07324 match_block.offset_max = (2*ocount)/3;
07325 match_block.offset_overflow = FALSE;
07326 match_block.capture_last = -1;
07327 
07328 /* Compute the minimum number of offsets that we need to reset each time. Doing
07329 this makes a huge difference to execution time when there aren't many brackets
07330 in the pattern. */
07331 
07332 resetcount = 2 + re->top_bracket * 2;
07333 if (resetcount > offsetcount) resetcount = ocount;
07334 
07335 /* Reset the working variable associated with each extraction. These should
07336 never be used unless previously set, but they get saved and restored, and so we
07337 initialize them to avoid reading uninitialized locations. */
07338 
07339 if (match_block.offset_vector != NULL)
07340   {
07341   register int *iptr = match_block.offset_vector + ocount;
07342   register int *iend = iptr - resetcount/2 + 1;
07343   while (--iptr >= iend) *iptr = -1;
07344   }
07345 
07346 /* Set up the first character to match, if available. The first_byte value is
07347 never set for an anchored regular expression, but the anchoring may be forced
07348 at run time, so we have to test for anchoring. The first char may be unset for
07349 an unanchored pattern, of course. If there's no first char and the pattern was
07350 studied, there may be a bitmap of possible first characters. */
07351 
07352 if (!anchored)
07353   {
07354   if ((re->options & PCRE_FIRSTSET) != 0)
07355     {
07356     first_byte = re->first_byte & 255;
07357     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
07358       first_byte = match_block.lcc[first_byte];
07359     }
07360   else
07361     if (!startline && study != NULL &&
07362       (study->options & PCRE_STUDY_MAPPED) != 0)
07363         start_bits = study->start_bits;
07364   }
07365 
07366 /* For anchored or unanchored matches, there may be a "last known required
07367 character" set. */
07368 
07369 if ((re->options & PCRE_REQCHSET) != 0)
07370   {
07371   req_byte = re->req_byte & 255;
07372   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
07373   req_byte2 = (tables + fcc_offset)[req_byte];  /* case flipped */    /*2003-05-04 AR*/
07374   }
07375 
07376 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
07377 the loop runs just once. */
07378 
07379 do
07380   {
07381   register int *iptr = match_block.offset_vector;
07382   register int *iend = iptr + resetcount;
07383 
07384   /* Reset the maximum number of extractions we might see. */
07385 
07386   while (iptr < iend) *iptr++ = -1;
07387 
07388   /* Advance to a unique first char if possible */
07389 
07390   if (first_byte >= 0)
07391     {
07392     if (first_byte_caseless)
07393       while (start_match < stop_match && /*2003-04-29 AR*/
07394              match_block.lcc[*start_match] != first_byte)
07395         start_match++;
07396     else
07397       while (start_match < stop_match && *start_match != first_byte) /*2003-04-29 AR*/
07398         start_match++;
07399     }
07400 
07401   /* Or to just after \n for a multiline match if possible */
07402 
07403   else if (startline)
07404     {
07405     if (start_match > match_block.start_subject + start_offset)
07406       {
07407       while (start_match < stop_match && start_match[-1] != NEWLINE) /*2003-04-29 AR*/ /*** FIXME: test LF too? ***/
07408         start_match++;
07409       }
07410     }
07411 
07412   /* Or to a non-unique first char after study */
07413 
07414   else if (start_bits != NULL)
07415     {
07416     while (start_match < stop_match) /*2003-04-29 AR*/
07417       {
07418       register int c = *start_match;
07419       if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
07420       }
07421     }
07422 
07423 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
07424   printf(">>>> Match against: ");
07425   pchars(start_match, end_subject - start_match, TRUE, &match_block);
07426   printf("\n");
07427 #endif
07428 
07429   /* If req_byte is set, we know that that character must appear in the subject
07430   for the match to succeed. If the first character is set, req_byte must be
07431   later in the subject; otherwise the test starts at the match point. This
07432   optimization can save a huge amount of backtracking in patterns with nested
07433   unlimited repeats that aren't going to match. Writing separate code for
07434   cased/caseless versions makes it go faster, as does using an autoincrement
07435   and backing off on a match.
07436 
07437   HOWEVER: when the subject string is very, very long, searching to its end can
07438   take a long time, and give bad performance on quite ordinary patterns. This
07439   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
07440   don't do this when the string is sufficiently long. */
07441 
07442   if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
07443     {
07444     register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
07445 
07446     /* We don't need to repeat the search if we haven't yet reached the
07447     place we found it at last time. */
07448 
07449     if (p > req_byte_ptr)
07450       {
07451       if (req_byte_caseless)
07452         {
07453         while (p < end_subject)
07454           {
07455           register int pp = *p++;
07456           if (pp == req_byte || pp == req_byte2) { p--; break; }
07457           }
07458         }
07459       else
07460         {
07461         while (p < end_subject)
07462           {
07463           if (*p++ == req_byte) { p--; break; }
07464           }
07465         }
07466 
07467       /* If we can't find the required character, break the matching loop */
07468 
07469       if (p >= end_subject) break;
07470 
07471       /* If we have found the required character, save the point where we
07472       found it, so that we don't search again next time round the loop if
07473       the start hasn't passed this character yet. */
07474 
07475       req_byte_ptr = p;
07476       }
07477     }
07478 
07479   /* When a match occurs, substrings will be set for all internal extractions;
07480   we just need to set up the whole thing as substring 0 before returning. If
07481   there were too many extractions, set the return code to zero. In the case
07482   where we had to get some local store to hold offsets for backreferences, copy
07483   those back references that we can. In this case there need not be overflow
07484   if certain parts of the pattern were not used. */
07485 
07486   match_block.start_match = start_match;
07487   match_block.match_call_count = 0;
07488 
07489   rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
07490     match_isgroup);
07491 
07492   if (rc == MATCH_NOMATCH)
07493     {
07494     start_match++;
07495 #ifdef SUPPORT_UTF8
07496     if (match_block.utf8)
07497       while((*start_match & 0xc0) == 0x80) start_match++;
07498 #endif
07499     continue;
07500     }
07501 
07502   if (rc != MATCH_MATCH)
07503     {
07504     DPRINTF((">>>> error: returning %d\n", rc));
07505     return rc;
07506     }
07507 
07508   /* We have a match! Copy the offset information from temporary store if
07509   necessary */
07510 
07511   if (using_temporary_offsets)
07512     {
07513     if (offsetcount >= 4)
07514       {
07515       memcpy(offsets + 2, match_block.offset_vector + 2,
07516         (offsetcount - 2) * sizeof(int));
07517       DPRINTF(("Copied offsets from temporary memory\n"));
07518       }
07519     if (match_block.end_offset_top > offsetcount)
07520       match_block.offset_overflow = TRUE;
07521 
07522     DPRINTF(("Freeing temporary memory\n"));
07523     (pcre_free)(match_block.offset_vector);
07524     }
07525 
07526   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
07527 
07528   if (offsetcount < 2) rc = 0; else
07529     {
07530     offsets[0] = start_match - match_block.start_subject;
07531     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
07532     }
07533 
07534   DPRINTF((">>>> returning %d\n", rc));
07535   return rc;
07536   }
07537 
07538 /* This "while" is the end of the "do" above */
07539 
07540 while (!anchored && start_match <= stop_match); /*2003-04-29 AR*/
07541 
07542 if (using_temporary_offsets)
07543   {
07544   DPRINTF(("Freeing temporary memory\n"));
07545   (pcre_free)(match_block.offset_vector);
07546   }
07547 
07548 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
07549 
07550 return PCRE_ERROR_NOMATCH;
07551 }
07552 
07553 /* End of pcre.c */

Generated on Wed May 31 18:19:59 2006 for frontierkernel 10.1.10a by  doxygen 1.4.6