study.c

Go to the documentation of this file.
00001 
00002 /*  $Id: study.c 355 2005-01-11 22:48:55Z andreradke $    */
00003 
00004 /*************************************************
00005 *      Perl-Compatible Regular Expressions       *
00006 *************************************************/
00007 
00008 /*
00009 This is a library of functions to support regular expressions whose syntax
00010 and semantics are as close as possible to those of the Perl 5 language. See
00011 the file Tech.Notes for some information on the internals.
00012 
00013 Written by: Philip Hazel <ph10@cam.ac.uk>
00014 
00015            Copyright (c) 1997-2002 University of Cambridge
00016 
00017 -----------------------------------------------------------------------------
00018 Permission is granted to anyone to use this software for any purpose on any
00019 computer system, and to redistribute it freely, subject to the following
00020 restrictions:
00021 
00022 1. This software is distributed in the hope that it will be useful,
00023    but WITHOUT ANY WARRANTY; without even the implied warranty of
00024    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
00025 
00026 2. The origin of this software must not be misrepresented, either by
00027    explicit claim or by omission.
00028 
00029 3. Altered versions must be plainly marked as such, and must not be
00030    misrepresented as being the original software.
00031 
00032 4. If PCRE is embedded in any software that is released under the GNU
00033    General Purpose Licence (GPL), then the terms of that licence shall
00034    supersede any condition above with which it is incompatible.
00035 -----------------------------------------------------------------------------
00036 */
00037 
00038 
00039 /* Include the internals header, which itself includes Standard C headers plus
00040 the external pcre header. */
00041 
00042 #include "pcre_internal.h"
00043 
00044 
00045 
00046 /*************************************************
00047 *      Set a bit and maybe its alternate case    *
00048 *************************************************/
00049 
00050 /* Given a character, set its bit in the table, and also the bit for the other
00051 version of a letter if we are caseless.
00052 
00053 Arguments:
00054   start_bits    points to the bit map
00055   c             is the character
00056   caseless      the caseless flag
00057   cd            the block with char table pointers
00058 
00059 Returns:        nothing
00060 */
00061 
00062 static void
00063 set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
00064 {
00065 start_bits[c/8] |= (1 << (c&7));
00066 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
00067   start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
00068 }
00069 
00070 
00071 
00072 /*************************************************
00073 *          Create bitmap of starting chars       *
00074 *************************************************/
00075 
00076 /* This function scans a compiled unanchored expression and attempts to build a
00077 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
00078 goes by, we may be able to get more clever at doing this.
00079 
00080 Arguments:
00081   code         points to an expression
00082   start_bits   points to a 32-byte table, initialized to 0
00083   caseless     the current state of the caseless flag
00084   utf8         TRUE if in UTF-8 mode
00085   cd           the block with char table pointers
00086 
00087 Returns:       TRUE if table built, FALSE otherwise
00088 */
00089 
00090 static BOOL
00091 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
00092   BOOL utf8, compile_data *cd)
00093 {
00094 register int c;
00095 
00096 /* This next statement and the later reference to dummy are here in order to
00097 trick the optimizer of the IBM C compiler for OS/2 into generating correct
00098 code. Apparently IBM isn't going to fix the problem, and we would rather not
00099 disable optimization (in this module it actually makes a big difference, and
00100 the pcre module can use all the optimization it can get). */
00101 
00102 volatile int dummy;
00103 
00104 do
00105   {
00106   const uschar *tcode = code + 1 + LINK_SIZE;
00107   BOOL try_next = TRUE;
00108 
00109   while (try_next)
00110     {
00111     /* If a branch starts with a bracket or a positive lookahead assertion,
00112     recurse to set bits from within them. That's all for this branch. */
00113 
00114     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
00115       {
00116       if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
00117         return FALSE;
00118       try_next = FALSE;
00119       }
00120 
00121     else switch(*tcode)
00122       {
00123       default:
00124       return FALSE;
00125 
00126       /* Skip over callout */
00127 
00128       case OP_CALLOUT:
00129       tcode += 2;
00130       break;
00131 
00132       /* Skip over extended extraction bracket number */
00133 
00134       case OP_BRANUMBER:
00135       tcode += 3;
00136       break;
00137 
00138       /* Skip over lookbehind and negative lookahead assertions */
00139 
00140       case OP_ASSERT_NOT:
00141       case OP_ASSERTBACK:
00142       case OP_ASSERTBACK_NOT:
00143       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
00144       tcode += 1+LINK_SIZE;
00145       break;
00146 
00147       /* Skip over an option setting, changing the caseless flag */
00148 
00149       case OP_OPT:
00150       caseless = (tcode[1] & PCRE_CASELESS) != 0;
00151       tcode += 2;
00152       break;
00153 
00154       /* BRAZERO does the bracket, but carries on. */
00155 
00156       case OP_BRAZERO:
00157       case OP_BRAMINZERO:
00158       if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
00159         return FALSE;
00160       dummy = 1;
00161       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
00162       tcode += 1+LINK_SIZE;
00163       break;
00164 
00165       /* Single-char * or ? sets the bit and tries the next item */
00166 
00167       case OP_STAR:
00168       case OP_MINSTAR:
00169       case OP_QUERY:
00170       case OP_MINQUERY:
00171       set_bit(start_bits, tcode[1], caseless, cd);
00172       tcode += 2;
00173 #ifdef SUPPORT_UTF8
00174       if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
00175 #endif
00176       break;
00177 
00178       /* Single-char upto sets the bit and tries the next */
00179 
00180       case OP_UPTO:
00181       case OP_MINUPTO:
00182       set_bit(start_bits, tcode[3], caseless, cd);
00183       tcode += 4;
00184 #ifdef SUPPORT_UTF8
00185       if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
00186 #endif
00187       break;
00188 
00189       /* At least one single char sets the bit and stops */
00190 
00191       case OP_EXACT:       /* Fall through */
00192       tcode++;
00193 
00194       case OP_CHARS:       /* Fall through */
00195       tcode++;
00196 
00197       case OP_PLUS:
00198       case OP_MINPLUS:
00199       set_bit(start_bits, tcode[1], caseless, cd);
00200       try_next = FALSE;
00201       break;
00202 
00203       /* Single character type sets the bits and stops */
00204 
00205       case OP_NOT_DIGIT:
00206       for (c = 0; c < 32; c++)
00207         start_bits[c] |= ~cd->cbits[c+cbit_digit];
00208       try_next = FALSE;
00209       break;
00210 
00211       case OP_DIGIT:
00212       for (c = 0; c < 32; c++)
00213         start_bits[c] |= cd->cbits[c+cbit_digit];
00214       try_next = FALSE;
00215       break;
00216 
00217       case OP_NOT_WHITESPACE:
00218       for (c = 0; c < 32; c++)
00219         start_bits[c] |= ~cd->cbits[c+cbit_space];
00220       try_next = FALSE;
00221       break;
00222 
00223       case OP_WHITESPACE:
00224       for (c = 0; c < 32; c++)
00225         start_bits[c] |= cd->cbits[c+cbit_space];
00226       try_next = FALSE;
00227       break;
00228 
00229       case OP_NOT_WORDCHAR:
00230       for (c = 0; c < 32; c++)
00231         start_bits[c] |= ~cd->cbits[c+cbit_word];
00232       try_next = FALSE;
00233       break;
00234 
00235       case OP_WORDCHAR:
00236       for (c = 0; c < 32; c++)
00237         start_bits[c] |= cd->cbits[c+cbit_word];
00238       try_next = FALSE;
00239       break;
00240 
00241       /* One or more character type fudges the pointer and restarts, knowing
00242       it will hit a single character type and stop there. */
00243 
00244       case OP_TYPEPLUS:
00245       case OP_TYPEMINPLUS:
00246       tcode++;
00247       break;
00248 
00249       case OP_TYPEEXACT:
00250       tcode += 3;
00251       break;
00252 
00253       /* Zero or more repeats of character types set the bits and then
00254       try again. */
00255 
00256       case OP_TYPEUPTO:
00257       case OP_TYPEMINUPTO:
00258       tcode += 2;               /* Fall through */
00259 
00260       case OP_TYPESTAR:
00261       case OP_TYPEMINSTAR:
00262       case OP_TYPEQUERY:
00263       case OP_TYPEMINQUERY:
00264       switch(tcode[1])
00265         {
00266         case OP_NOT_DIGIT:
00267         for (c = 0; c < 32; c++)
00268           start_bits[c] |= ~cd->cbits[c+cbit_digit];
00269         break;
00270 
00271         case OP_DIGIT:
00272         for (c = 0; c < 32; c++)
00273           start_bits[c] |= cd->cbits[c+cbit_digit];
00274         break;
00275 
00276         case OP_NOT_WHITESPACE:
00277         for (c = 0; c < 32; c++)
00278           start_bits[c] |= ~cd->cbits[c+cbit_space];
00279         break;
00280 
00281         case OP_WHITESPACE:
00282         for (c = 0; c < 32; c++)
00283           start_bits[c] |= cd->cbits[c+cbit_space];
00284         break;
00285 
00286         case OP_NOT_WORDCHAR:
00287         for (c = 0; c < 32; c++)
00288           start_bits[c] |= ~cd->cbits[c+cbit_word];
00289         break;
00290 
00291         case OP_WORDCHAR:
00292         for (c = 0; c < 32; c++)
00293           start_bits[c] |= cd->cbits[c+cbit_word];
00294         break;
00295         }
00296 
00297       tcode += 2;
00298       break;
00299 
00300       /* Character class where all the information is in a bit map: set the
00301       bits and either carry on or not, according to the repeat count. If it was
00302       a negative class, and we are operating with UTF-8 characters, any byte
00303       with the top-bit set is a potentially valid starter because it may start
00304       a character with a value > 255. (This is sub-optimal in that the
00305       character may be in the range 128-255, and those characters might be
00306       unwanted, but that's as far as we go for the moment.) */
00307 
00308       case OP_NCLASS:
00309       if (utf8) memset(start_bits+16, 0xff, 16);
00310       /* Fall through */
00311 
00312       case OP_CLASS:
00313         {
00314         tcode++;
00315         for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
00316         tcode += 32;
00317         switch (*tcode)
00318           {
00319           case OP_CRSTAR:
00320           case OP_CRMINSTAR:
00321           case OP_CRQUERY:
00322           case OP_CRMINQUERY:
00323           tcode++;
00324           break;
00325 
00326           case OP_CRRANGE:
00327           case OP_CRMINRANGE:
00328           if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
00329             else try_next = FALSE;
00330           break;
00331 
00332           default:
00333           try_next = FALSE;
00334           break;
00335           }
00336         }
00337       break; /* End of bitmap class handling */
00338 
00339       }      /* End of switch */
00340     }        /* End of try_next loop */
00341 
00342   code += GET(code, 1);   /* Advance to next branch */
00343   }
00344 while (*code == OP_ALT);
00345 return TRUE;
00346 }
00347 
00348 
00349 
00350 /*************************************************
00351 *          Study a compiled expression           *
00352 *************************************************/
00353 
00354 /* This function is handed a compiled expression that it must study to produce
00355 information that will speed up the matching. It returns a pcre_extra block
00356 which then gets handed back to pcre_exec().
00357 
00358 Arguments:
00359   re        points to the compiled expression
00360   options   contains option bits
00361   errorptr  points to where to place error messages;
00362             set NULL unless error
00363 
00364 Returns:    pointer to a pcre_extra block, with study_data filled in and the
00365               appropriate flag set;
00366             NULL on error or if no optimization possible
00367 */
00368 
00369 pcre_extra *
00370 pcre_study(const pcre *external_re, int options, const char **errorptr)
00371 {
00372 uschar start_bits[32];
00373 pcre_extra *extra;
00374 pcre_study_data *study;
00375 const real_pcre *re = (const real_pcre *)external_re;
00376 uschar *code = (uschar *)re + sizeof(real_pcre) +
00377   (re->name_count * re->name_entry_size);
00378 compile_data compile_block;
00379 
00380 *errorptr = NULL;
00381 
00382 if (re == NULL || re->magic_number != MAGIC_NUMBER)
00383   {
00384   *errorptr = "argument is not a compiled regular expression";
00385   return NULL;
00386   }
00387 
00388 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
00389   {
00390   *errorptr = "unknown or incorrect option bit(s) set";
00391   return NULL;
00392   }
00393 
00394 /* For an anchored pattern, or an unanchored pattern that has a first char, or
00395 a multiline pattern that matches only at "line starts", no further processing
00396 at present. */
00397 
00398 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
00399   return NULL;
00400 
00401 /* Set the character tables in the block which is passed around */
00402 
00403 compile_block.lcc = re->tables + lcc_offset;
00404 compile_block.fcc = re->tables + fcc_offset;
00405 compile_block.cbits = re->tables + cbits_offset;
00406 compile_block.ctypes = re->tables + ctypes_offset;
00407 
00408 /* See if we can find a fixed set of initial characters for the pattern. */
00409 
00410 memset(start_bits, 0, 32 * sizeof(uschar));
00411 if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
00412   (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
00413 
00414 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
00415 the latter, which is pointed to by the former, which may also get additional
00416 data set later by the calling program. At the moment, the size of
00417 pcre_study_data is fixed. We nevertheless save it in a field for returning via
00418 the pcre_fullinfo() function so that if it becomes variable in the future, we
00419 don't have to change that code. */
00420 
00421 extra = (pcre_extra *)(pcre_malloc)
00422   (sizeof(pcre_extra) + sizeof(pcre_study_data));
00423 
00424 if (extra == NULL)
00425   {
00426   *errorptr = "failed to get memory";
00427   return NULL;
00428   }
00429 
00430 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
00431 extra->flags = PCRE_EXTRA_STUDY_DATA;
00432 extra->study_data = study;
00433 
00434 study->size = sizeof(pcre_study_data);
00435 study->options = PCRE_STUDY_MAPPED;
00436 memcpy(study->start_bits, start_bits, sizeof(start_bits));
00437 
00438 return extra;
00439 }
00440 
00441 /* End of study.c */

Generated on Wed May 31 18:20:02 2006 for frontierkernel 10.1.10a by  doxygen 1.4.6