regexp.cpp

00001 // -*- c-basic-offset: 2 -*-
00002 /*
00003  *  This file is part of the KDE libraries
00004  *  Copyright (C) 1999-2001 Harri Porten (porten@kde.org)
00005  *
00006  *  This library is free software; you can redistribute it and/or
00007  *  modify it under the terms of the GNU Lesser General Public
00008  *  License as published by the Free Software Foundation; either
00009  *  version 2 of the License, or (at your option) any later version.
00010  *
00011  *  This library is distributed in the hope that it will be useful,
00012  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  *  Lesser General Public License for more details.
00015  *
00016  *  You should have received a copy of the GNU Lesser General Public
00017  *  License along with this library; if not, write to the Free Software
00018  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
00019  *
00020  */
00021 
00022 #include "regexp.h"
00023 
00024 #include "lexer.h"
00025 #include <stdio.h>
00026 #include <stdlib.h>
00027 #include <string.h>
00028 
00029 using namespace KJS;
00030 
00031 RegExp::RegExp(const UString &p, int f)
00032   : pat(p), flgs(f), m_notEmpty(false), valid(true)
00033 {
00034   nrSubPatterns = 0; // determined in match() with POSIX regex.
00035 
00036   // JS regexps can contain Unicode escape sequences (\uxxxx) which
00037   // are rather uncommon elsewhere. As our regexp libs don't understand
00038   // them we do the unescaping ourselves internally.
00039   UString intern;
00040   if (p.find('\\') >= 0) {
00041     bool escape = false;
00042     for (int i = 0; i < p.size(); ++i) {
00043       UChar c = p[i];
00044       if (escape) {
00045         escape = false;
00046         // we only care about \uxxxx
00047         if (c == 'u' && i + 4 < p.size()) {
00048           int c0 = p[i+1].unicode();
00049           int c1 = p[i+2].unicode();
00050           int c2 = p[i+3].unicode();
00051           int c3 = p[i+4].unicode();
00052           if (Lexer::isHexDigit(c0) && Lexer::isHexDigit(c1) &&
00053               Lexer::isHexDigit(c2) && Lexer::isHexDigit(c3)) {
00054             c = Lexer::convertUnicode(c0, c1, c2, c3);
00055             intern += UString(&c, 1);
00056             i += 4;
00057             continue;
00058           }
00059         }
00060         intern += UString('\\');
00061         intern += UString(&c, 1);
00062       } else {
00063         if (c == '\\')
00064           escape = true;
00065         else
00066           intern += UString(&c, 1);
00067       }
00068     }
00069   } else {
00070     intern = p;
00071   }
00072 
00073 #ifdef HAVE_PCREPOSIX
00074   int pcreflags = 0;
00075   const char *perrormsg;
00076   int errorOffset;
00077 
00078   if (flgs & IgnoreCase)
00079     pcreflags |= PCRE_CASELESS;
00080 
00081   if (flgs & Multiline)
00082     pcreflags |= PCRE_MULTILINE;
00083 
00084   pcregex = pcre_compile(intern.ascii(), pcreflags,
00085              &perrormsg, &errorOffset, NULL);
00086   if (!pcregex) {
00087 #ifndef NDEBUG
00088     fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
00089 #endif
00090     valid = false;
00091     return;
00092   }
00093 
00094 #ifdef PCRE_INFO_CAPTURECOUNT
00095   // Get number of subpatterns that will be returned
00096   int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
00097   if (rc != 0)
00098 #endif
00099     nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
00100 
00101 #else /* HAVE_PCREPOSIX */
00102 
00103   int regflags = 0;
00104 #ifdef REG_EXTENDED
00105   regflags |= REG_EXTENDED;
00106 #endif
00107 #ifdef REG_ICASE
00108   if ( f & IgnoreCase )
00109     regflags |= REG_ICASE;
00110 #endif
00111 
00112   //NOTE: Multiline is not feasible with POSIX regex.
00113   //if ( f & Multiline )
00114   //    ;
00115   // Note: the Global flag is already handled by RegExpProtoFunc::execute
00116 
00117   int errorCode = regcomp(&preg, intern.ascii(), regflags);
00118   if (errorCode != 0) {
00119 #ifndef NDEBUG
00120     char errorMessage[80];
00121     regerror(errorCode, &preg, errorMessage, sizeof errorMessage);
00122     fprintf(stderr, "KJS: regcomp failed with '%s'", errorMessage);
00123 #endif
00124     valid = false;
00125   }
00126 #endif
00127 }
00128 
00129 RegExp::~RegExp()
00130 {
00131 #ifdef HAVE_PCREPOSIX
00132   if (pcregex)
00133     pcre_free(pcregex);
00134 #else
00135   /* TODO: is this really okay after an error ? */
00136   regfree(&preg);
00137 #endif
00138 }
00139 
00140 UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
00141 {
00142   if (i < 0)
00143     i = 0;
00144   if (ovector)
00145     *ovector = 0L;
00146   int dummyPos;
00147   if (!pos)
00148     pos = &dummyPos;
00149   *pos = -1;
00150   if (i > s.size() || s.isNull())
00151     return UString::null;
00152 
00153 #ifdef HAVE_PCREPOSIX
00154   CString buffer(s.cstring());
00155   int bufferSize = buffer.size();
00156   int ovecsize = (nrSubPatterns+1)*3; // see pcre docu
00157   if (ovector) *ovector = new int[ovecsize];
00158   if (!pcregex)
00159     return UString::null;
00160 
00161   if (pcre_exec(pcregex, NULL, buffer.c_str(), bufferSize, i,
00162                 m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED) : 0, // see man pcretest
00163                 ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00164   {
00165     // Failed to match.
00166     if ((flgs & Global) && m_notEmpty && ovector)
00167     {
00168       // We set m_notEmpty ourselves, to look for a non-empty match
00169       // (see man pcretest or pcretest.c for details).
00170       // So we don't stop here, we want to try again at i+1.
00171 #ifdef KJS_VERBOSE
00172       fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
00173 #endif
00174       m_notEmpty = 0;
00175       if (pcre_exec(pcregex, NULL, buffer.c_str(), bufferSize, i+1, 0,
00176                     ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00177         return UString::null;
00178     }
00179     else // done
00180       return UString::null;
00181   }
00182 
00183   // Got a match, proceed with it.
00184 
00185   if (!ovector)
00186     return UString::null; // don't rely on the return value if you pass ovector==0
00187 #else
00188   const uint maxMatch = 10;
00189   regmatch_t rmatch[maxMatch];
00190 
00191   char *str = strdup(s.ascii()); // TODO: why ???
00192   if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
00193     free(str);
00194     return UString::null;
00195   }
00196   free(str);
00197 
00198   if (!ovector) {
00199     *pos = rmatch[0].rm_so + i;
00200     return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
00201   }
00202 
00203   // map rmatch array to ovector used in PCRE case
00204   nrSubPatterns = 0;
00205   for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
00206     nrSubPatterns++;
00207     // if the nonEmpty flag is set, return a failed match if any of the
00208     // subMatches happens to be an empty string.
00209     if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo) 
00210       return UString::null;
00211   }
00212   // Allow an ovector slot to return the (failed) match result.
00213   if (nrSubPatterns == 0) nrSubPatterns = 1;
00214   
00215   int ovecsize = (nrSubPatterns)*3; // see above
00216   *ovector = new int[ovecsize];
00217   for (uint j = 0; j < nrSubPatterns; j++) {
00218       (*ovector)[2*j] = rmatch[j].rm_so + i;
00219       (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
00220   }
00221 #endif
00222 
00223   *pos = (*ovector)[0];
00224   if ( *pos == (*ovector)[1] && (flgs & Global) )
00225   {
00226     // empty match, next try will be with m_notEmpty=true
00227     m_notEmpty=true;
00228   }
00229   return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
00230 }
00231 
00232 #if 0 // unused
00233 bool RegExp::test(const UString &s, int)
00234 {
00235 #ifdef HAVE_PCREPOSIX
00236   int ovector[300];
00237   CString buffer(s.cstring());
00238 
00239   if (s.isNull() ||
00240       pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
00241         0, ovector, 300) == PCRE_ERROR_NOMATCH)
00242     return false;
00243   else
00244     return true;
00245 
00246 #else
00247 
00248   char *str = strdup(s.ascii());
00249   int r = regexec(&preg, str, 0, 0, 0);
00250   free(str);
00251 
00252   return r == 0;
00253 #endif
00254 }
00255 #endif
KDE Home | KDE Accessibility Home | Description of Access Keys