makedent.cpp
00001 /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ 00002 /* enchant 00003 * Copyright (C) 2003 Dom Lachowicz 00004 * 00005 * This library is free software; you can redistribute it and/or 00006 * modify it under the terms of the GNU Lesser General Public 00007 * License as published by the Free Software Foundation; either 00008 * version 2.1 of the License, or (at your option) any later version. 00009 * 00010 * This library is distributed in the hope that it will be useful, 00011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00013 * Lesser General Public License for more details. 00014 * 00015 * You should have received a copy of the GNU Lesser General Public 00016 * License along with this library; if not, write to the 00017 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00018 * Boston, MA 02110-1301, USA. 00019 * 00020 * In addition, as a special exception, Dom Lachowicz 00021 * gives permission to link the code of this program with 00022 * non-LGPL Spelling Provider libraries (eg: a MSFT Office 00023 * spell checker backend) and distribute linked combinations including 00024 * the two. You must obey the GNU Lesser General Public License in all 00025 * respects for all of the code used other than said providers. If you modify 00026 * this file, you may extend this exception to your version of the 00027 * file, but you are not obligated to do so. If you do not wish to 00028 * do so, delete this exception statement from your version. 00029 */ 00030 00031 /* 00032 * Copyright 1988, 1989, 1992, 1993, Geoff Kuenning, Granada Hills, CA 00033 * All rights reserved. 00034 * 00035 * Redistribution and use in source and binary forms, with or without 00036 * modification, are permitted provided that the following conditions 00037 * are met: 00038 * 00039 * 1. Redistributions of source code must retain the above copyright 00040 * notice, this list of conditions and the following disclaimer. 00041 * 2. Redistributions in binary form must reproduce the above copyright 00042 * notice, this list of conditions and the following disclaimer in the 00043 * documentation and/or other materials provided with the distribution. 00044 * 3. All modifications to the source code must be clearly marked as 00045 * such. Binary redistributions based on modified source code 00046 * must be clearly marked as modified versions in the documentation 00047 * and/or other materials provided with the distribution. 00048 * 4. All advertising materials mentioning features or use of this software 00049 * must display the following acknowledgment: 00050 * This product includes software developed by Geoff Kuenning and 00051 * other unpaid contributors. 00052 * 5. The name of Geoff Kuenning may not be used to endorse or promote 00053 * products derived from this software without specific prior 00054 * written permission. 00055 * 00056 * THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND 00057 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00058 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00059 * ARE DISCLAIMED. IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE 00060 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00061 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 00062 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 00063 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00064 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 00065 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00066 * SUCH DAMAGE. 00067 */ 00068 00069 /* 00070 * $Log$ 00071 * Revision 1.2 2004/02/01 04:46:46 zrusin 00072 * Both ispell and aspell plugins are not working properly. We can start switching. 00073 * 00074 * Revision 1.1 2004/01/31 16:44:12 zrusin 00075 * ISpell plugin. 00076 * 00077 * Revision 1.4 2003/08/14 17:51:28 dom 00078 * update license - exception clause should be Lesser GPL 00079 * 00080 * Revision 1.3 2003/07/28 20:40:27 dom 00081 * fix up the license clause, further win32-registry proof some directory getting functions 00082 * 00083 * Revision 1.2 2003/07/16 22:52:49 dom 00084 * LGPL + exception license 00085 * 00086 * Revision 1.1 2003/07/15 01:15:08 dom 00087 * ispell enchant backend 00088 * 00089 * Revision 1.3 2003/02/12 02:10:38 hippietrail 00090 * 00091 * C casts -> C++ casts 00092 * Improved const-correctness due to changing casts 00093 * Fixed some warnings 00094 * 00095 * Revision 1.2 2003/01/29 05:50:12 hippietrail 00096 * 00097 * Fixed my mess in EncodingManager. 00098 * Changed many C casts to C++ casts. 00099 * 00100 * Revision 1.1 2003/01/24 05:52:35 hippietrail 00101 * 00102 * Refactored ispell code. Old ispell global variables had been put into 00103 * an allocated structure, a pointer to which was passed to many functions. 00104 * I have now made all such functions and variables private members of the 00105 * ISpellChecker class. It was C OO, now it's C++ OO. 00106 * 00107 * I've fixed the makefiles and tested compilation but am unable to test 00108 * operation. Please back out my changes if they cause problems which 00109 * are not obvious or easy to fix. 00110 * 00111 * Revision 1.8 2003/01/06 18:48:40 dom 00112 * ispell cleanup, start of using new 'add' save features 00113 * 00114 * Revision 1.7 2003/01/04 19:09:04 dom 00115 * some tidying... bug pissing me off... 00116 * 00117 * Revision 1.6 2002/09/19 05:31:18 hippietrail 00118 * 00119 * More Ispell cleanup. Conditional globals and DEREF macros are removed. 00120 * K&R function declarations removed, converted to Doxygen style comments 00121 * where possible. No code has been changed (I hope). Compiles for me but 00122 * unable to test. 00123 * 00124 * Revision 1.5 2002/09/17 03:03:30 hippietrail 00125 * 00126 * After seeking permission on the developer list I've reformatted all the 00127 * spelling source which seemed to have parts which used 2, 3, 4, and 8 00128 * spaces for tabs. It should all look good with our standard 4-space 00129 * tabs now. 00130 * I've concentrated just on indentation in the actual code. More prettying 00131 * could be done. 00132 * * NO code changes were made * 00133 * 00134 * Revision 1.4 2002/09/13 17:20:13 mpritchett 00135 * Fix more warnings for Linux build 00136 * 00137 * Revision 1.3 2002/03/22 14:31:57 dom 00138 * fix mg's compile problem 00139 * 00140 * Revision 1.2 2001/05/12 16:05:42 thomasf 00141 * Big pseudo changes to ispell to make it pass around a structure rather 00142 * than rely on all sorts of gloabals willy nilly here and there. Also 00143 * fixed our spelling class to work with accepting suggestions once more. 00144 * This code is dirty, gross and ugly (not to mention still not supporting 00145 * multiple hash sized just yet) but it works on my machine and will no 00146 * doubt break other machines. 00147 * 00148 * Revision 1.1 2001/04/15 16:01:24 tomas_f 00149 * moving to spell/xp 00150 * 00151 * Revision 1.6 1999/12/21 18:46:29 sterwill 00152 * ispell patch for non-English dictionaries by Henrik Berg <henrik@lansen.se> 00153 * 00154 * Revision 1.5 1999/10/20 03:19:35 paul 00155 * Hacked ispell code to ignore any characters that don't fit in the lookup tables loaded from the dictionary. It ain't pretty, but at least we don't crash there any more. 00156 * 00157 * Revision 1.4 1999/04/13 17:12:51 jeff 00158 * Applied "Darren O. Benham" <gecko@benham.net> spell check changes. 00159 * Fixed crash on Win32 with the new code. 00160 * 00161 * Revision 1.3 1998/12/29 14:55:33 eric 00162 * 00163 * I've doctored the ispell code pretty extensively here. It is now 00164 * warning-free on Win32. It also *works* on Win32 now, since I 00165 * replaced all the I/O calls with ANSI standard ones. 00166 * 00167 * Revision 1.3 1998/12/29 14:55:33 eric 00168 * 00169 * I've doctored the ispell code pretty extensively here. It is now 00170 * warning-free on Win32. It also *works* on Win32 now, since I 00171 * replaced all the I/O calls with ANSI standard ones. 00172 * 00173 * Revision 1.2 1998/12/28 23:11:30 eric 00174 * 00175 * modified spell code and integration to build on Windows. 00176 * This is still a hack. 00177 * 00178 * Actually, it doesn't yet WORK on Windows. It just builds. 00179 * SpellCheckInit is failing for some reason. 00180 * 00181 * Revision 1.1 1998/12/28 18:04:43 davet 00182 * Spell checker code stripped from ispell. At this point, there are 00183 * two external routines... the Init routine, and a check-a-word routine 00184 * which returns a boolean value, and takes a 16 bit char string. 00185 * The code resembles the ispell code as much as possible still. 00186 * 00187 * Revision 1.45 1994/12/27 23:08:52 geoff 00188 * Add code to makedent to reject words that contain non-word characters. 00189 * This helps protect people who use ISO 8-bit characters when ispell 00190 * isn't configured for that option. 00191 * 00192 * Revision 1.44 1994/10/25 05:46:20 geoff 00193 * Fix some incorrect declarations in the lint versions of some routines. 00194 * 00195 * Revision 1.43 1994/09/16 03:32:34 geoff 00196 * Issue an error message for bad affix flags 00197 * 00198 * Revision 1.42 1994/02/07 04:23:43 geoff 00199 * Correctly identify the deformatter when changing file types 00200 * 00201 * Revision 1.41 1994/01/25 07:11:55 geoff 00202 * Get rid of all old RCS log lines in preparation for the 3.1 release. 00203 * 00204 */ 00205 00206 #include <stdlib.h> 00207 #include <string.h> 00208 #include <ctype.h> 00209 00210 #include "ispell_checker.h" 00211 #include "msgs.h" 00212 00213 int makedent P ((char * lbuf, int lbuflen, struct dent * ent)); 00214 /*int combinecaps P ((struct dent * hdr, struct dent * newent)); 00215 #ifndef NO_CAPITALIZATION_SUPPORT 00216 static void forcevheader P ((struct dent * hdrp, struct dent * oldp, 00217 struct dent * newp)); 00218 #endif / * NO_CAPITALIZATION_SUPPORT * / 00219 static int combine_two_entries P ((struct dent * hdrp, 00220 struct dent * oldp, struct dent * newp)); 00221 static int acoversb P ((struct dent * enta, struct dent * entb)); 00222 */ 00223 /*static int issubset P ((struct dent * ent1, struct dent * ent2)); 00224 static void combineaffixes P ((struct dent * ent1, struct dent * ent2));*/ 00225 00226 void toutent P ((FILE * outfile, struct dent * hent, 00227 int onlykeep)); 00228 /*static void toutword P ((FILE * outfile, char * word, 00229 struct dent * cent)); 00230 static void flagout P ((FILE * outfile, int flag)); 00231 */ 00232 #ifndef ICHAR_IS_CHAR 00233 ichar_t * icharcpy P ((ichar_t * out, ichar_t * in)); 00234 int icharlen P ((ichar_t * str)); 00235 int icharcmp P ((ichar_t * s1, ichar_t * s2)); 00236 int icharncmp P ((ichar_t * s1, ichar_t * s2, int n)); 00237 #endif /* ICHAR_IS_CHAR */ 00238 00239 /*static int has_marker;*/ 00240 00241 /* 00242 * Fill in a directory entry, including setting the capitalization flags, and 00243 * allocate and initialize memory for the d->word field. Returns -1 00244 * if there was trouble. The input word must be in canonical form. 00245 int makedent (lbuf, lbuflen, d) 00246 This function is not used by AbiWord. I don't know if it'll be needed for 00247 other abi documents 00248 */ 00249 00250 #ifndef NO_CAPITALIZATION_SUPPORT 00251 00259 long 00260 ISpellChecker::whatcap (ichar_t *word) 00261 { 00262 register ichar_t * p; 00263 00264 for (p = word; *p; p++) 00265 { 00266 if (mylower (*p)) 00267 break; 00268 } 00269 if (*p == '\0') 00270 return ALLCAPS; 00271 else 00272 { 00273 for ( ; *p; p++) 00274 { 00275 if (myupper (*p)) 00276 break; 00277 } 00278 if (*p == '\0') 00279 { 00280 /* 00281 ** No uppercase letters follow the lowercase ones. 00282 ** If there is more than one uppercase letter, it's 00283 ** "followcase". If only the first one is capitalized, 00284 ** it's "capitalize". If there are no capitals 00285 ** at all, it's ANYCASE. 00286 */ 00287 if (myupper (word[0])) 00288 { 00289 for (p = word + 1; *p != '\0'; p++) 00290 { 00291 if (myupper (*p)) 00292 return FOLLOWCASE; 00293 } 00294 return CAPITALIZED; 00295 } 00296 else 00297 return ANYCASE; 00298 } 00299 else 00300 return FOLLOWCASE; /* .../lower/upper */ 00301 } 00302 } 00303 00312 int ISpellChecker::addvheader ( struct dent *dp) 00313 { 00314 register struct dent * tdent; /* Copy of entry */ 00315 00316 /* 00317 ** Add a second entry with the correct capitalization, and then make 00318 ** dp into a special dummy entry. 00319 */ 00320 tdent = static_cast<struct dent *>(malloc(sizeof (struct dent))); 00321 if (tdent == NULL) 00322 { 00323 fprintf (stderr, MAKEDENT_C_NO_WORD_SPACE, dp->word); 00324 return -1; 00325 } 00326 *tdent = *dp; 00327 if (captype (tdent->flagfield) != FOLLOWCASE) 00328 tdent->word = NULL; 00329 else 00330 { 00331 /* Followcase words need a copy of the capitalization */ 00332 tdent->word = static_cast<char *>(malloc (static_cast<unsigned int>(strlen(tdent->word)) + 1)); 00333 if (tdent->word == NULL) 00334 { 00335 fprintf (stderr, MAKEDENT_C_NO_WORD_SPACE, dp->word); 00336 free (reinterpret_cast<char *>(tdent)); 00337 return -1; 00338 } 00339 strcpy (tdent->word, dp->word); 00340 } 00341 chupcase (dp->word); 00342 dp->next = tdent; 00343 dp->flagfield &= ~CAPTYPEMASK; 00344 dp->flagfield |= (ALLCAPS | MOREVARIANTS); 00345 return 0; 00346 } 00347 #endif /* NO_CAPITALIZATION_SUPPORT */ 00348 00349 /* 00350 ** Combine and resolve the entries describing two capitalizations of the same 00351 ** word. This may require allocating yet more entries. 00352 ** 00353 ** Hdrp is a pointer into a hash table. If the word covered by hdrp has 00354 ** variations, hdrp must point to the header. Newp is a pointer to temporary 00355 ** storage, and space is malloc'ed if newp is to be kept. The newp->word 00356 ** field must have been allocated with mymalloc, so that this routine may free 00357 ** the space if it keeps newp but not the word. 00358 ** 00359 ** Return value: 0 if the word was added, 1 if the word was combined 00360 ** with an existing entry, and -1 if trouble occurred (e.g., malloc). 00361 ** If 1 is returned, newp->word may have been be freed using myfree. 00362 ** 00363 ** Life is made much more difficult by the KEEP flag's possibilities. We 00364 ** must ensure that a !KEEP word doesn't find its way into the personal 00365 ** dictionary as a result of this routine's actions. However, a !KEEP 00366 ** word that has affixes must have come from the main dictionary, so it 00367 ** is acceptable to combine entries in that case (got that?). 00368 ** 00369 ** The net result of all this is a set of rules that is a bloody pain 00370 ** to figure out. Basically, we want to choose one of the following actions: 00371 ** 00372 ** (1) Add newp's affixes and KEEP flag to oldp, and discard newp. 00373 ** (2) Add oldp's affixes and KEEP flag to newp, replace oldp with 00374 ** newp, and discard newp. 00375 #ifndef NO_CAPITALIZATION_SUPPORT 00376 ** (3) Insert newp as a new entry in the variants list. If there is 00377 ** currently no variant header, this requires adding one. Adding a 00378 ** header splits into two sub-cases: 00379 ** 00380 ** (3a) If oldp is ALLCAPS and the KEEP flags match, just turn it 00381 ** into the header. 00382 ** (3b) Otherwise, add a new entry to serve as the header. 00383 ** To ease list linking, this is done by copying oldp into 00384 ** the new entry, and then performing (3a). 00385 ** 00386 ** After newp has been added as a variant, its affixes and KEEP 00387 ** flag are OR-ed into the variant header. 00388 #endif 00389 ** 00390 ** So how to choose which? The default is always case (3), which adds newp 00391 ** as a new entry in the variants list. Cases (1) and (2) are symmetrical 00392 ** except for which entry is discarded. We can use case (1) or (2) whenever 00393 ** one entry "covers" the other. "Covering" is defined as follows: 00394 ** 00395 ** (4) For entries with matching capitalization types, A covers B 00396 ** if: 00397 ** 00398 ** (4a) B's affix flags are a subset of A's, or the KEEP flags 00399 ** match, and 00400 ** (4b) either the KEEP flags match, or A's KEEP flag is set. 00401 ** (Since A has more suffixes, combining B with it won't 00402 ** cause any extra suffixes to be added to the dictionary.) 00403 ** (4c) If the words are FOLLOWCASE, the capitalizations match 00404 ** exactly. 00405 ** 00406 #ifndef NO_CAPITALIZATION_SUPPORT 00407 ** (5) For entries with mismatched capitalization types, A covers B 00408 ** if (4a) and (4b) are true, and: 00409 ** 00410 ** (5a) B is ALLCAPS, or 00411 ** (5b) A is ANYCASE, and B is CAPITALIZED. 00412 #endif 00413 ** 00414 ** For any "hdrp" without variants, oldp is the same as hdrp. Otherwise, 00415 ** the above tests are applied using each variant in turn for oldp. 00416 int combinecaps (hdrp, newp) 00417 static void forcevheader (hdrp, oldp, newp) 00418 static int combine_two_entries (hdrp, oldp, newp) 00419 static int acoversb (enta, entb) 00420 */ 00421 00422 /* 00423 * \param s 00424 */ 00425 void 00426 ISpellChecker::upcase (ichar_t *s) 00427 { 00428 00429 while (*s) 00430 { 00431 *s = mytoupper (*s); 00432 s++; 00433 } 00434 } 00435 00436 /* 00437 * \param s 00438 */ 00439 void 00440 ISpellChecker::lowcase (ichar_t *s) 00441 { 00442 00443 while (*s) 00444 { 00445 *s = mytolower (*s); 00446 s++; 00447 } 00448 } 00449 00456 void 00457 ISpellChecker::chupcase (char *s) 00458 { 00459 ichar_t * is; 00460 00461 is = strtosichar (s, 1); 00462 upcase (is); 00463 ichartostr (s, is, strlen (s) + 1, 1); 00464 } 00465 00466 /* 00467 ** See if one affix field is a subset of another. Returns NZ if ent1 00468 ** is a subset of ent2. The KEEP flag is not taken into consideration. 00469 static int issubset (ent1, ent2) 00470 static void combineaffixes (ent1, ent2) 00471 */ 00472 00473 /* 00474 ** Write out a dictionary entry, including capitalization variants. 00475 ** If onlykeep is true, only those variants with KEEP set will be 00476 ** written. 00477 Removed -- not used by Abiword 00478 void toutent_ (toutfile, hent, onlykeep) 00479 static void toutword (toutfile, word, cent) 00480 static void flagout (toutfile, flag) 00481 */ 00482 00498 int 00499 ISpellChecker::stringcharlen (char *bufp, int canonical) 00500 { 00501 #ifdef SLOWMULTIPLY 00502 static char * sp[MAXSTRINGCHARS]; 00503 static int inited = 0; 00504 #endif /* SLOWMULTIPLY */ 00505 register char * bufcur; 00506 register char * stringcur; 00507 register int stringno; 00508 register int lowstringno; 00509 register int highstringno; 00510 int dupwanted; 00511 00512 #ifdef SLOWMULTIPLY 00513 if (!inited) 00514 { 00515 inited = 1; 00516 for (stringno = 0; stringno < MAXSTRINGCHARS; stringno++) 00517 sp[stringno] = &hashheader.stringchars[stringno][0]; 00518 } 00519 #endif /* SLOWMULTIPLY */ 00520 lowstringno = 0; 00521 highstringno = m_hashheader.nstrchars - 1; 00522 dupwanted = canonical ? 0 : m_defdupchar; 00523 while (lowstringno <= highstringno) 00524 { 00525 stringno = (lowstringno + highstringno) >> 1; 00526 #ifdef SLOWMULTIPLY 00527 stringcur = sp[stringno]; 00528 #else /* SLOWMULTIPLY */ 00529 stringcur = &m_hashheader.stringchars[stringno][0]; 00530 #endif /* SLOWMULTIPLY */ 00531 bufcur = bufp; 00532 while (*stringcur) 00533 { 00534 #ifdef NO8BIT 00535 if (((*bufcur++ ^ *stringcur) & 0x7F) != 0) 00536 #else /* NO8BIT */ 00537 if (*bufcur++ != *stringcur) 00538 #endif /* NO8BIT */ 00539 break; 00540 /* 00541 ** We can't use autoincrement above because of the 00542 ** test below. 00543 */ 00544 stringcur++; 00545 } 00546 if (*stringcur == '\0') 00547 { 00548 if (m_hashheader.dupnos[stringno] == dupwanted) 00549 { 00550 /* We have a match */ 00551 m_laststringch = m_hashheader.stringdups[stringno]; 00552 #ifdef SLOWMULTIPLY 00553 return stringcur - sp[stringno]; 00554 #else /* SLOWMULTIPLY */ 00555 return stringcur - &m_hashheader.stringchars[stringno][0]; 00556 #endif /* SLOWMULTIPLY */ 00557 } 00558 else 00559 --stringcur; 00560 } 00561 /* No match - choose which side to search on */ 00562 #ifdef NO8BIT 00563 if ((*--bufcur & 0x7F) < (*stringcur & 0x7F)) 00564 highstringno = stringno - 1; 00565 else if ((*bufcur & 0x7F) > (*stringcur & 0x7F)) 00566 lowstringno = stringno + 1; 00567 #else /* NO8BIT */ 00568 if (*--bufcur < *stringcur) 00569 highstringno = stringno - 1; 00570 else if (*bufcur > *stringcur) 00571 lowstringno = stringno + 1; 00572 #endif /* NO8BIT */ 00573 else if (dupwanted < m_hashheader.dupnos[stringno]) 00574 highstringno = stringno - 1; 00575 else 00576 lowstringno = stringno + 1; 00577 } 00578 m_laststringch = static_cast<unsigned int>(-1); 00579 return 0; /* Not a string character */ 00580 } 00581 00582 /* MACROS CONVERTED TO FUNCTIONS 00583 ** These macros are similar to the ones above, but they take into account 00584 ** the possibility of string characters. Note well that they take a POINTER, 00585 ** not a character. 00586 ** 00587 ** The "l_" versions set "len" to the length of the string character as a 00588 ** handy side effect. (Note that the global "laststringch" is also set, 00589 ** and sometimes used, by these macros.) 00590 ** 00591 ** The "l1_" versions go one step further and guarantee that the "len" 00592 ** field is valid for *all* characters, being set to 1 even if the macro 00593 ** returns false. This macro is a great example of how NOT to write 00594 ** readable C. 00595 */ 00596 #define isstringch(ptr, canon) (isstringstart (*(ptr)) \ 00597 && stringcharlen ((ptr), (canon)) > 0) 00598 /* 00599 int isstringch(char *ptr, int canon) { 00600 return (isstringstart (*(ptr)) && (len = stringcharlen ((ptr), (canon))) > 0); 00601 } 00602 */ 00603 00604 #define l_isstringch(ptr, len, canon) \ 00605 (isstringstart (*(ptr)) \ 00606 && (len = stringcharlen ((ptr), (canon))) \ 00607 > 0) 00608 /* 00609 int l_isstringch(char *ptr, int len, int canon) { 00610 return (isstringstart (*(ptr)) && (len = stringcharlen ((ptr), (canon))) > 0); 00611 } 00612 */ 00613 00614 #define l1_isstringch(ptr, len, canon) \ 00615 (len = 1, \ 00616 isstringstart ((unsigned char)(*(ptr))) \ 00617 && ((len = \ 00618 stringcharlen ((ptr), (canon))) \ 00619 > 0 \ 00620 ? 1 : (len = 1, 0))) 00621 /* 00622 int l1_isstringch(char *ptr, int len, int canon) { 00623 return (len = 1, isstringstart ((unsigned char)(*(ptr))) && 00624 ((len = stringcharlen ((ptr), (canon))) > 0 ? 1 : (len = 1, 0))); 00625 } 00626 */ 00627 00628 /*** END MACRO CONVERSION ***/ 00629 00641 int 00642 ISpellChecker::strtoichar (ichar_t *out, char *in, int outlen, int canonical) 00643 { 00644 register int len = 1; /* Length of next character */ 00645 00646 outlen /= sizeof (ichar_t); /* Convert to an ichar_t count */ 00647 for ( ; --outlen > 0 && *in != '\0'; in += len) 00648 { 00649 if (l1_isstringch (in, len , canonical)) { 00650 *out++ = SET_SIZE + m_laststringch; 00651 } else { 00652 *out++ = (unsigned char)( *in ); 00653 } 00654 } 00655 *out = 0; 00656 return outlen <= 0; 00657 } 00658 00674 int 00675 ISpellChecker::ichartostr ( char *out, ichar_t *in, int outlen, int canonical) 00676 { 00677 register int ch; /* Next character to store */ 00678 register int i; /* Index into duplicates list */ 00679 register char * scharp; /* Pointer into a string char */ 00680 00681 while (--outlen > 0 && (ch = *in++) != 0) 00682 { 00683 if (ch < SET_SIZE) 00684 *out++ = static_cast<char>(ch); 00685 else 00686 { 00687 ch -= SET_SIZE; 00688 if (!canonical) 00689 { 00690 for (i = m_hashheader.nstrchars; --i >= 0; ) 00691 { 00692 if (m_hashheader.dupnos[i] == m_defdupchar 00693 && (static_cast<int>(m_hashheader.stringdups[i])) == ch) 00694 { 00695 ch = i; 00696 break; 00697 } 00698 } 00699 } 00700 scharp = m_hashheader.stringchars[static_cast<unsigned>(ch)]; 00701 while ((*out++ = *scharp++) != '\0') 00702 ; 00703 out--; 00704 } 00705 } 00706 *out = '\0'; 00707 return outlen <= 0; 00708 } 00709 00718 ichar_t * 00719 ISpellChecker::strtosichar ( char *in, int canonical) 00720 { 00721 static ichar_t out[STRTOSICHAR_SIZE / sizeof (ichar_t)]; 00722 00723 if (strtoichar (out, in, sizeof out, canonical)) 00724 fprintf (stderr, WORD_TOO_LONG (in)); 00725 return out; 00726 } 00727 00736 char * 00737 ISpellChecker::ichartosstr (ichar_t *in, int canonical) 00738 { 00739 static char out[ICHARTOSSTR_SIZE]; 00740 00741 if (ichartostr (out, in, sizeof out, canonical)) 00742 fprintf (stderr, WORD_TOO_LONG (out)); 00743 return out; 00744 } 00745 00754 char * 00755 ISpellChecker::printichar (int in) 00756 { 00757 static char out[MAXSTRINGCHARLEN + 1]; 00758 00759 if (in < SET_SIZE) 00760 { 00761 out[0] = static_cast<char>(in); 00762 out[1] = '\0'; 00763 } 00764 else 00765 strcpy (out, m_hashheader.stringchars[static_cast<unsigned>(in) - SET_SIZE]); 00766 return out; 00767 } 00768 00769 #ifndef ICHAR_IS_CHAR 00770 00778 ichar_t * 00779 icharcpy (ichar_t *out, ichar_t *in) 00780 { 00781 ichar_t * origout; /* Copy of destination for return */ 00782 00783 origout = out; 00784 while ((*out++ = *in++) != 0) 00785 ; 00786 return origout; 00787 } 00788 00796 int 00797 icharlen (ichar_t * in) 00798 { 00799 register int len; /* Length so far */ 00800 00801 for (len = 0; *in++ != 0; len++) 00802 ; 00803 return len; 00804 } 00805 00814 int 00815 icharcmp (ichar_t * s1, ichar_t * s2) 00816 { 00817 00818 while (*s1 != 0) 00819 { 00820 if (*s1++ != *s2++) 00821 return *--s1 - *--s2; 00822 } 00823 return *s1 - *s2; 00824 } 00825 00835 int 00836 icharncmp (ichar_t *s1, ichar_t *s2, int n) 00837 { 00838 00839 while (--n >= 0 && *s1 != 0) 00840 { 00841 if (*s1++ != *s2++) 00842 return *--s1 - *--s2; 00843 } 00844 if (n < 0) 00845 return 0; 00846 else 00847 return *s1 - *s2; 00848 } 00849 00850 #endif /* ICHAR_IS_CHAR */ 00851 00852 /* 00853 * \param istate 00854 * \param name 00855 * \param searchnames 00856 * \param deformatter 00857 * 00858 * \return 00859 */ 00860 int 00861 ISpellChecker::findfiletype (const char *name, int searchnames, int *deformatter) 00862 { 00863 char * cp; /* Pointer into suffix list */ 00864 int cplen; /* Length of current suffix */ 00865 register int i; /* Index into type table */ 00866 int len; /* Length of the name */ 00867 00868 /* 00869 * Note: for now, the deformatter is set to 1 for tex, 0 for nroff. 00870 * Further, we assume that it's one or the other, so that a test 00871 * for tex is sufficient. This needs to be generalized. 00872 */ 00873 len = strlen (name); 00874 if (searchnames) 00875 { 00876 for (i = 0; i < m_hashheader.nstrchartype; i++) 00877 { 00878 if (strcmp (name, m_chartypes[i].name) == 0) 00879 { 00880 if (deformatter != NULL) 00881 *deformatter = 00882 (strcmp (m_chartypes[i].deformatter, "tex") == 0); 00883 return i; 00884 } 00885 } 00886 } 00887 for (i = 0; i < m_hashheader.nstrchartype; i++) 00888 { 00889 for (cp = m_chartypes[i].suffixes; *cp != '\0'; cp += cplen + 1) 00890 { 00891 cplen = strlen (cp); 00892 if (len >= cplen && strcmp (&name[len - cplen], cp) == 0) 00893 { 00894 if (deformatter != NULL) 00895 *deformatter = 00896 (strcmp (m_chartypes[i].deformatter, "tex") == 0); 00897 return i; 00898 } 00899 } 00900 } 00901 return -1; 00902 } 00903 00904 /* 00905 HACK: macros replaced with function implementations 00906 so we could do a side-effect-free check for unicode 00907 characters which aren't in hashheader 00908 00909 TODO: this is just a workaround to keep us from crashing. 00910 more sophisticated logic needed here. 00911 */ 00912 char ISpellChecker::myupper(ichar_t c) 00913 { 00914 if (c < (SET_SIZE + MAXSTRINGCHARS)) 00915 return m_hashheader.upperchars[c]; 00916 else 00917 return 0; 00918 } 00919 00920 char ISpellChecker::mylower(ichar_t c) 00921 { 00922 if (c < (SET_SIZE + MAXSTRINGCHARS)) 00923 return m_hashheader.lowerchars[c]; 00924 else 00925 return 0; 00926 } 00927 00928 int myspace(ichar_t c) 00929 { 00930 return ((c > 0) && (c < 0x80) && isspace(static_cast<unsigned char>(c))); 00931 } 00932 00933 char ISpellChecker::iswordch(ichar_t c) 00934 { 00935 if (c < (SET_SIZE + MAXSTRINGCHARS)) 00936 return m_hashheader.wordchars[c]; 00937 else 00938 return 0; 00939 } 00940 00941 char ISpellChecker::isboundarych(ichar_t c) 00942 { 00943 if (c < (SET_SIZE + MAXSTRINGCHARS)) 00944 return m_hashheader.boundarychars[c]; 00945 else 00946 return 0; 00947 } 00948 00949 char ISpellChecker::isstringstart(ichar_t c) 00950 { 00951 if (c < (SET_SIZE)) 00952 return m_hashheader.stringstarts[static_cast<unsigned char>(c)]; 00953 else 00954 return 0; 00955 } 00956 00957 ichar_t ISpellChecker::mytolower(ichar_t c) 00958 { 00959 if (c < (SET_SIZE + MAXSTRINGCHARS)) 00960 return m_hashheader.lowerconv[c]; 00961 else 00962 return c; 00963 } 00964 00965 ichar_t ISpellChecker::mytoupper (ichar_t c) 00966 { 00967 if (c < (SET_SIZE + MAXSTRINGCHARS)) 00968 return m_hashheader.upperconv[c]; 00969 else 00970 return c; 00971 } 00972