Main Page   Modules   Data Structures   File List   Data Fields   Globals   Related Pages  

file/ascmagic.c

Go to the documentation of this file.
00001 /*
00002  * ASCII magic -- file types that we know based on keywords
00003  * that can appear anywhere in the file.
00004  *
00005  * Copyright (c) Ian F. Darwin, 1987.
00006  * Written by Ian F. Darwin.
00007  *
00008  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
00009  * to handle character codes other than ASCII on a unified basis.
00010  *
00011  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
00012  * international characters, now subsumed into this file.
00013  */
00014 
00015 /*
00016  * This software is not subject to any license of the American Telephone
00017  * and Telegraph Company or of the Regents of the University of California.
00018  *
00019  * Permission is granted to anyone to use this software for any purpose on
00020  * any computer system, and to alter it and redistribute it freely, subject
00021  * to the following restrictions:
00022  *
00023  * 1. The author is not responsible for the consequences of use of this
00024  *    software, no matter how awful, even if they arise from flaws in it.
00025  *
00026  * 2. The origin of this software must not be misrepresented, either by
00027  *    explicit claim or by omission.  Since few users ever read sources,
00028  *    credits must appear in the documentation.
00029  *
00030  * 3. Altered versions must be plainly marked as such, and must not be
00031  *    misrepresented as being the original software.  Since few users
00032  *    ever read sources, credits must appear in the documentation.
00033  *
00034  * 4. This notice may not be removed or altered.
00035  */
00036 
00037 #include "system.h"
00038 #include "file.h"
00039 #include "names.h"
00040 #include "tar.h"
00041 #include "debug.h"
00042 
00043 FILE_RCSID("@(#)Id: ascmagic.c,v 1.32 2002/07/03 18:26:37 christos Exp ")
00044 
00045 /*@access fmagic @*/
00046 
00047 /*
00048  * Stolen (by the author!) from the public domain tar program:
00049  * Public Domain version written 26 Aug 1985 John Gilmore (ihnp4!hoptoad!gnu).
00050  */
00051 #define isodigit(c)     ( ((c) >= '0') && ((c) <= '7') )
00052 
/*
 * Decode an octal number stored in a fixed-width text field.
 *
 * digs  - number of bytes in the field
 * where - first byte of the field
 *
 * Leading whitespace is skipped, then octal digits are accumulated until
 * the field runs out or a byte that is not an octal digit is reached.
 *
 * Returns the decoded value.  Returns -1 if the field holds nothing but
 * whitespace, or if the digits are followed, still inside the field, by
 * a byte that is neither whitespace nor NUL.
 */
/*@-bounds@*/
static int
from_oct(int digs, char *where)
	/*@*/
{
	int result = 0;

	/* Step over leading blanks; running out of field here means "blank". */
	for (; isspace((unsigned char)*where); where++) {
		if (--digs <= 0)
			return -1;
	}

/*@-shiftimplementation@*/
	/* Fold in one octal digit (three bits) per byte. */
	while (digs > 0 && *where >= '0' && *where <= '7') {
		result = (result << 3) | (*where - '0');
		where++;
		digs--;
	}
/*@=shiftimplementation@*/

	/* Anything left in the field must be a terminator: NUL or space. */
	if (digs > 0 && *where != '\0' && !isspace((unsigned char)*where))
		return -1;

	return result;
}
/*@=bounds@*/
00084 
00085 /*
00086  * Return 
00087  *      0 if the checksum is bad (i.e., probably not a tar archive), 
00088  *      1 for old UNIX tar file,
00089  *      2 for Unix Std (POSIX) tar file.
00090  */
00091 static int
00092 is_tar(const fmagic fm)
00093         /*@*/
00094 {
00095         int nb = fm->nb;
00096         union record *header = (union record *)fm->buf;
00097         int     i;
00098         int     sum, recsum;
00099         char    *p;
00100 
00101         if (nb < sizeof(*header))
00102                 return 0;
00103 
00104         recsum = from_oct(8,  header->header.chksum);
00105 
00106         sum = 0;
00107         p = header->charptr;
00108 /*@-sizeoftype@*/
00109         for (i = sizeof(union record); --i >= 0;)
00110 /*@=sizeoftype@*/
00111         {
00112                 /*
00113                  * We can't use unsigned char here because of old compilers,
00114                  * e.g. V7.
00115                  */
00116                 sum += 0xFF & *p++;
00117         }
00118 
00119         /* Adjust checksum to count the "chksum" field as blanks. */
00120         for (i = sizeof(header->header.chksum); --i >= 0;)
00121                 sum -= 0xFF & header->header.chksum[i];
00122         sum += ' ' * sizeof header->header.chksum;      
00123 
00124         if (sum != recsum)
00125                 return 0;       /* Not a tar archive */
00126         
00127         if (!strcmp(header->header.magic, TARMAGIC)) 
00128                 return 2;               /* Unix Standard tar archive */
00129 
00130         return 1;                       /* Old fashioned tar archive */
00131 }
/* One character of the converted (one-value-per-character) text. */
typedef unsigned long unichar;

/* Lines longer than this are reported as "very long". */
#define MAXLINELEN 300	/* longest sane line length */

/*
 * Whitespace test used on both raw bytes and unichar values: space, tab,
 * CR, LF, the "next line" control 0x85, and form feed.  The argument is
 * evaluated more than once, so it must be free of side effects.
 */
#define ISSPC(x) \
	((x) == ' '  || \
	 (x) == '\t' || \
	 (x) == '\r' || \
	 (x) == '\n' || \
	 (x) == 0x85 || \
	 (x) == '\f')
00137 
00138 /*
00139  * This table reflects a particular philosophy about what constitutes
00140  * "text," and there is room for disagreement about it.
00141  *
00142  * Version 3.31 of the file command considered a file to be ASCII if
00143  * each of its characters was approved by either the isascii() or
00144  * isalpha() function.  On most systems, this would mean that any
00145  * file consisting only of characters in the range 0x00 ... 0x7F
00146  * would be called ASCII text, but many systems might reasonably
00147  * consider some characters outside this range to be alphabetic,
00148  * so the file command would call such characters ASCII.  It might
00149  * have been more accurate to call this "considered textual on the
00150  * local system" than "ASCII."
00151  *
00152  * It considered a file to be "International language text" if each
00153  * of its characters was either an ASCII printing character (according
00154  * to the real ASCII standard, not the above test), a character in
00155  * the range 0x80 ... 0xFF, or one of the following control characters:
00156  * backspace, tab, line feed, vertical tab, form feed, carriage return,
00157  * escape.  No attempt was made to determine the language in which files
00158  * of this type were written.
00159  *
00160  *
00161  * The table below considers a file to be ASCII if all of its characters
00162  * are either ASCII printing characters (again, according to the X3.4
00163  * standard, not isascii()) or any of the following controls: bell,
00164  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
00165  *
00166  * I include bell because some programs (particularly shell scripts)
00167  * use it literally, even though it is rare in normal text.  I exclude
00168  * vertical tab because it never seems to be used in real text.  I also
00169  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
00170  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
00171  * character to.  It might be more appropriate to include it in the 8859
00172  * set instead of the ASCII set, but it's got to be included in *something*
00173  * we recognize or EBCDIC files aren't going to be considered textual.
00174  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
00175  * and Latin characters, so these should possibly be allowed.  But they
00176  * make a real mess on VT100-style displays if they're not paired properly,
00177  * so we are probably better off not calling them text.
00178  *
00179  * A file is considered to be ISO-8859 text if its characters are all
00180  * either ASCII, according to the above definition, or printing characters
00181  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
00182  *
00183  * Finally, a file is considered to be international text from some other
00184  * character code if its characters are all either ISO-8859 (according to
00185  * the above definition) or characters in the range 0x80 ... 0x9F, which
00186  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
00187  * consider to be printing characters.
00188  */
00189 
/* Character classes stored in text_chars[]. */
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * Class of every byte value, indexed by the byte itself.  Each row below
 * covers sixteen consecutive values, split into two groups of eight.
 */
/*@unchecked@*/ /*@observer@*/
static char text_chars[256] = {
	/* 0x00-0x0f: BEL(07) BS(08) HT(09) LF(0a) FF(0c) CR(0d) are text */
	F, F, F, F, F, F, F, T,   T, T, T, F, T, T, F, F,
	/* 0x10-0x1f: only ESC(1b) is text */
	F, F, F, F, F, F, F, F,   F, F, F, T, F, F, F, F,
	/* 0x20-0x2f */
	T, T, T, T, T, T, T, T,   T, T, T, T, T, T, T, T,
	/* 0x30-0x3f */
	T, T, T, T, T, T, T, T,   T, T, T, T, T, T, T, T,
	/* 0x40-0x4f */
	T, T, T, T, T, T, T, T,   T, T, T, T, T, T, T, T,
	/* 0x50-0x5f */
	T, T, T, T, T, T, T, T,   T, T, T, T, T, T, T, T,
	/* 0x60-0x6f */
	T, T, T, T, T, T, T, T,   T, T, T, T, T, T, T, T,
	/* 0x70-0x7f: DEL(7f) is not text */
	T, T, T, T, T, T, T, T,   T, T, T, T, T, T, T, F,
	/* 0x80-0x8f: NEL(85) is counted as plain text */
	X, X, X, X, X, T, X, X,   X, X, X, X, X, X, X, X,
	/* 0x90-0x9f */
	X, X, X, X, X, X, X, X,   X, X, X, X, X, X, X, X,
	/* 0xa0-0xaf */
	I, I, I, I, I, I, I, I,   I, I, I, I, I, I, I, I,
	/* 0xb0-0xbf */
	I, I, I, I, I, I, I, I,   I, I, I, I, I, I, I, I,
	/* 0xc0-0xcf */
	I, I, I, I, I, I, I, I,   I, I, I, I, I, I, I, I,
	/* 0xd0-0xdf */
	I, I, I, I, I, I, I, I,   I, I, I, I, I, I, I, I,
	/* 0xe0-0xef */
	I, I, I, I, I, I, I, I,   I, I, I, I, I, I, I, I,
	/* 0xf0-0xff */
	I, I, I, I, I, I, I, I,   I, I, I, I, I, I, I, I
};
00217 
00218 /*@-bounds@*/
00219 static int
00220 looks_ascii(const unsigned char *buf, int nb,
00221                 /*@out@*/ unichar *ubuf, /*@out@*/ int *ulen)
00222         /*@modifies *ubuf, *ulen @*/
00223 {
00224         int i;
00225 
00226         *ulen = 0;
00227 
00228         for (i = 0; i < nb; i++) {
00229                 int t = text_chars[buf[i]];
00230 
00231                 if (t != T)
00232                         return 0;
00233 
00234                 ubuf[(*ulen)++] = buf[i];
00235         }
00236 
00237         return 1;
00238 }
00239 /*@=bounds@*/
00240 
00241 /*@-bounds@*/
00242 static int
00243 looks_latin1(const unsigned char *buf, int nb,
00244                 /*@out@*/ unichar *ubuf, /*@out@*/ int *ulen)
00245         /*@modifies *ubuf, *ulen @*/
00246 {
00247         int i;
00248 
00249         *ulen = 0;
00250 
00251         for (i = 0; i < nb; i++) {
00252                 int t = text_chars[buf[i]];
00253 
00254                 if (t != T && t != I)
00255                         return 0;
00256 
00257                 ubuf[(*ulen)++] = buf[i];
00258         }
00259 
00260         return 1;
00261 }
00262 /*@=bounds@*/
00263 
00264 /*@-bounds@*/
00265 static int
00266 looks_extended(const unsigned char *buf, int nb,
00267                 /*@out@*/ unichar *ubuf, /*@out@*/ int *ulen)
00268         /*@modifies *ubuf, *ulen @*/
00269 {
00270         int i;
00271 
00272         *ulen = 0;
00273 
00274         for (i = 0; i < nb; i++) {
00275                 int t = text_chars[buf[i]];
00276 
00277                 if (t != T && t != I && t != X)
00278                         return 0;
00279 
00280                 ubuf[(*ulen)++] = buf[i];
00281         }
00282 
00283         return 1;
00284 }
00285 /*@=bounds@*/
00286 
00287 /*@-bounds@*/
00288 static int
00289 looks_utf8(const unsigned char *buf, int nb,
00290                 /*@out@*/ unichar *ubuf, /*@out@*/ int *ulen)
00291         /*@modifies *ubuf, *ulen @*/
00292 {
00293         int i, n;
00294         unichar c;
00295         int gotone = 0;
00296 
00297         *ulen = 0;
00298 
00299         for (i = 0; i < nb; i++) {
00300                 if ((buf[i] & 0x80) == 0) {        /* 0xxxxxxx is plain ASCII */
00301                         /*
00302                          * Even if the whole file is valid UTF-8 sequences,
00303                          * still reject it if it uses weird control characters.
00304                          */
00305 
00306                         if (text_chars[buf[i]] != T)
00307                                 return 0;
00308 
00309                         ubuf[(*ulen)++] = buf[i];
00310                 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
00311                         return 0;
00312                 } else {                           /* 11xxxxxx begins UTF-8 */
00313                         int following;
00314 
00315                         if ((buf[i] & 0x20) == 0) {             /* 110xxxxx */
00316                                 c = buf[i] & 0x1f;
00317                                 following = 1;
00318                         } else if ((buf[i] & 0x10) == 0) {      /* 1110xxxx */
00319                                 c = buf[i] & 0x0f;
00320                                 following = 2;
00321                         } else if ((buf[i] & 0x08) == 0) {      /* 11110xxx */
00322                                 c = buf[i] & 0x07;
00323                                 following = 3;
00324                         } else if ((buf[i] & 0x04) == 0) {      /* 111110xx */
00325                                 c = buf[i] & 0x03;
00326                                 following = 4;
00327                         } else if ((buf[i] & 0x02) == 0) {      /* 1111110x */
00328                                 c = buf[i] & 0x01;
00329                                 following = 5;
00330                         } else
00331                                 return 0;
00332 
00333                         for (n = 0; n < following; n++) {
00334                                 i++;
00335                                 if (i >= nb)
00336                                         goto done;
00337 
00338                                 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
00339                                         return 0;
00340 
00341                                 c = (c << 6) + (buf[i] & 0x3f);
00342                         }
00343 
00344                         ubuf[(*ulen)++] = c;
00345                         gotone = 1;
00346                 }
00347         }
00348 done:
00349         return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
00350 }
00351 /*@=bounds@*/
00352 
00353 /*@-bounds@*/
00354 static int
00355 looks_unicode(const unsigned char *buf, int nb,
00356                 /*@out@*/ unichar *ubuf, /*@out@*/ int *ulen)
00357         /*@modifies *ubuf, *ulen @*/
00358 {
00359         int bigend;
00360         int i;
00361 
00362         if (nb < 2)
00363                 return 0;
00364 
00365         if (buf[0] == 0xff && buf[1] == 0xfe)
00366                 bigend = 0;
00367         else if (buf[0] == 0xfe && buf[1] == 0xff)
00368                 bigend = 1;
00369         else
00370                 return 0;
00371 
00372         *ulen = 0;
00373 
00374         for (i = 2; i + 1 < nb; i += 2) {
00375                 /* XXX fix to properly handle chars > 65536 */
00376 
00377                 if (bigend)
00378                         ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
00379                 else
00380                         ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
00381 
00382                 if (ubuf[*ulen - 1] == 0xfffe)
00383                         return 0;
00384                 if (ubuf[*ulen - 1] < 128 && text_chars[ubuf[*ulen - 1]] != T)
00385                         return 0;
00386         }
00387 
00388         return 1;
00389 }
00390 /*@=bounds@*/
00391 
00392 #undef F
00393 #undef T
00394 #undef I
00395 #undef X
00396 
00397 /*
00398  * This table maps each EBCDIC character to an (8-bit extended) ASCII
00399  * character, as specified in the rationale for the dd(1) command in
00400  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
00401  *
00402  * Unfortunately it does not seem to correspond exactly to any of the
00403  * five variants of EBCDIC documented in IBM's _Enterprise Systems
00404  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
00405  * Edition, July, 1999, pp. I-1 - I-4.
00406  *
00407  * Fortunately, though, all versions of EBCDIC, including this one, agree
00408  * on most of the printing characters that also appear in (7-bit) ASCII.
00409  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
00410  *
00411  * Fortunately too, there is general agreement that codes 0x00 through
00412  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
00413  * remainder printing characters.
00414  *
00415  * This is sufficient to allow us to identify EBCDIC text and to distinguish
00416  * between old-style and internationalized examples of text.
00417  */
00418 
/*@unchecked@*/ /*@observer@*/
static unsigned char ebcdic_to_ascii[256] = {
	/* 0x00 */
	  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
	/* 0x10 */
	 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
	/* 0x20 */
	128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
	/* 0x30 */
	144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
	/* 0x40 */
	' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
	/* 0x50 */
	'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
	/* 0x60 */
	'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
	/* 0x70 */
	186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
	/* 0x80 */
	195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
	/* 0x90 */
	202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
	/* 0xa0 */
	209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
	/* 0xb0 */
	216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
	/* 0xc0 */
	'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
	/* 0xd0 */
	'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
	/* 0xe0 */
	'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
	/* 0xf0 */
	'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
};
00438 
00439 /*
00440  * The following EBCDIC-to-ASCII table may relate more closely to reality,
00441  * or at least to modern reality.  It comes from
00442  *
00443  *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
00444  *
00445  * and maps the characters of EBCDIC code page 1047 (the code used for
00446  * Unix-derived software on IBM's 390 systems) to the corresponding
00447  * characters from ISO 8859-1.
00448  *
00449  * If this table is used instead of the above one, some of the special
00450  * cases for the NEL character can be taken out of the code.
00451  */
00452 
/*@unchecked@*/ /*@unused@*/ /*@observer@*/
static unsigned char ebcdic_1047_to_8859[256] = {
	/* 0x00 */
	0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
	/* 0x10 */
	0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
	/* 0x20 */
	0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
	/* 0x30 */
	0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
	/* 0x40 */
	0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
	/* 0x50 */
	0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
	/* 0x60 */
	0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
	/* 0x70 */
	0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
	/* 0x80 */
	0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
	/* 0x90 */
	0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
	/* 0xa0 */
	0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
	/* 0xb0 */
	0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
	/* 0xc0 */
	0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
	/* 0xd0 */
	0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
	/* 0xe0 */
	0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
	/* 0xf0 */
	0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
00472 
00473 /*
00474  * Copy buf[0 ... nb-1] into out[], translating EBCDIC to ASCII.
00475  */
00476 /*@-bounds@*/
00477 static void
00478 from_ebcdic(const unsigned char *buf, int nb, /*@out@*/ unsigned char *otp)
00479         /*@modifies *otp @*/
00480 {
00481         int i;
00482 
00483         for (i = 0; i < nb; i++) {
00484                 otp[i] = ebcdic_to_ascii[buf[i]];
00485         }
00486 }
00487 /*@=bounds@*/
00488 
00489 /*@-bounds@*/
00490 static int
00491 fmagicAMatch(const unsigned char *s, const unichar *us, int ulen)
00492         /*@*/
00493 {
00494         size_t i;
00495 
00496         for (i = 0; i < ulen; i++) {
00497                 if (s[i] != us[i])
00498                         return 0;
00499         }
00500 
00501         if (s[i])
00502                 return 0;
00503         else
00504                 return 1;
00505 }
00506 /*@=bounds@*/
00507 
00508 /* int nb: size actually read */
00509 /*@-bounds@*/
00510 int
00511 fmagicA(fmagic fm)
00512 {
00513         unsigned char * buf = fm->buf;
00514         int nb = fm->nb;
00515 
00516         char nbuf[HOWMANY+1];           /* one extra for terminating '\0' */
00517         unichar ubuf[HOWMANY+1];        /* one extra for terminating '\0' */
00518         int ulen;
00519         struct names *p;
00520         int i;
00521 
00522         char *code = NULL;
00523         char *code_mime = NULL;
00524         char *type = NULL;
00525         char *subtype = NULL;
00526         char *subtype_mime = NULL;
00527 
00528         int has_escapes = 0;
00529         int has_backspace = 0;
00530 
00531         int n_crlf = 0;
00532         int n_lf = 0;
00533         int n_cr = 0;
00534         int n_nel = 0;
00535 
00536         int last_line_end = -1;
00537         int has_long_lines = 0;
00538 
00539         /*
00540          * Do the tar test first, because if the first file in the tar
00541          * archive starts with a dot, we can confuse it with an nroff file.
00542          */
00543         switch (is_tar(fm)) {
00544         case 1:
00545                 fmagicPrintf(fm, ((fm->flags & FMAGIC_FLAGS_MIME)
00546                         ? "application/x-tar" : "tar archive"));
00547                 return 1;
00548         case 2:
00549                 fmagicPrintf(fm, ((fm->flags & FMAGIC_FLAGS_MIME)
00550                         ? "application/x-tar, POSIX" : "POSIX tar archive"));
00551                 return 1;
00552         }
00553 
00554         /*
00555          * Undo the NUL-termination kindly provided by fmagicProcess()
00556          * but leave at least one byte to look at
00557          */
00558 
00559         while (nb > 1 && buf[nb - 1] == '\0')
00560                 nb--;
00561 
00562         /*
00563          * Then try to determine whether it's any character code we can
00564          * identify.  Each of these tests, if it succeeds, will leave
00565          * the text converted into one-unichar-per-character Unicode in
00566          * ubuf, and the number of characters converted in ulen.
00567          */
00568         if (looks_ascii(buf, nb, ubuf, &ulen)) {
00569                 code = "ASCII";
00570                 code_mime = "us-ascii";
00571                 type = "text";
00572         } else if (looks_utf8(buf, nb, ubuf, &ulen)) {
00573                 code = "UTF-8 Unicode";
00574                 code_mime = "utf-8";
00575                 type = "text";
00576         } else if ((i = looks_unicode(buf, nb, ubuf, &ulen))) {
00577                 if (i == 1)
00578                         code = "Little-endian UTF-16 Unicode";
00579                 else
00580                         code = "Big-endian UTF-16 Unicode";
00581 
00582                 type = "character data";
00583                 code_mime = "utf-16";    /* is this defined? */
00584         } else if (looks_latin1(buf, nb, ubuf, &ulen)) {
00585                 code = "ISO-8859";
00586                 type = "text";
00587                 code_mime = "iso-8859-1"; 
00588         } else if (looks_extended(buf, nb, ubuf, &ulen)) {
00589                 code = "Non-ISO extended-ASCII";
00590                 type = "text";
00591                 code_mime = "unknown";
00592         } else {
00593                 from_ebcdic(buf, nb, nbuf);
00594 
00595                 if (looks_ascii(nbuf, nb, ubuf, &ulen)) {
00596                         code = "EBCDIC";
00597                         type = "character data";
00598                         code_mime = "ebcdic";
00599                 } else if (looks_latin1(nbuf, nb, ubuf, &ulen)) {
00600                         code = "International EBCDIC";
00601                         type = "character data";
00602                         code_mime = "ebcdic";
00603                 } else {
00604                         return 0;  /* doesn't look like text at all */
00605                 }
00606         }
00607 
00608         /*
00609          * for troff, look for . + letter + letter or .\";
00610          * this must be done to disambiguate tar archives' ./file
00611          * and other trash from real troff input.
00612          *
00613          * I believe Plan 9 troff allows non-ASCII characters in the names
00614          * of macros, so this test might possibly fail on such a file.
00615          */
00616         if (*ubuf == '.') {
00617                 unichar *tp = ubuf + 1;
00618 
00619                 while (ISSPC(*tp))
00620                         ++tp;   /* skip leading whitespace */
00621                 if ((tp[0] == '\\' && tp[1] == '\"') ||
00622                     (isascii(tp[0]) && isalnum(tp[0]) &&
00623                      isascii(tp[1]) && isalnum(tp[1]) &&
00624                      ISSPC(tp[2]))) {
00625                         subtype_mime = "text/troff";
00626                         subtype = "troff or preprocessor input";
00627                         goto subtype_identified;
00628                 }
00629         }
00630 
00631         if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
00632                 subtype_mime = "text/fortran";
00633                 subtype = "fortran program";
00634                 goto subtype_identified;
00635         }
00636 
00637         /* look for tokens from names.h - this is expensive! */
00638 
00639         i = 0;
00640         while (i < ulen) {
00641                 int end;
00642 
00643                 /*
00644                  * skip past any leading space
00645                  */
00646                 while (i < ulen && ISSPC(ubuf[i]))
00647                         i++;
00648                 if (i >= ulen)
00649                         break;
00650 
00651                 /*
00652                  * find the next whitespace
00653                  */
00654                 for (end = i + 1; end < nb; end++)
00655                         if (ISSPC(ubuf[end]))
00656                                 /*@innerbreak@*/ break;
00657 
00658                 /*
00659                  * compare the word thus isolated against the token list
00660                  */
00661 /*@-sizeoftype@*/
00662                 for (p = names; p < names + NNAMES; p++)
00663 /*@=sizeoftype@*/
00664                 {
00665                         if (p->name == NULL)
00666                                 /*@innerbreak@*/ break;
00667                         if (fmagicAMatch(p->name, ubuf + i, end - i)) {
00668                                 subtype = types[p->type].human;
00669                                 subtype_mime = types[p->type].mime;
00670                                 goto subtype_identified;
00671                         }
00672                 }
00673 
00674                 i = end;
00675         }
00676 
00677 subtype_identified:
00678 
00679         /*
00680          * Now try to discover other details about the file.
00681          */
00682         for (i = 0; i < ulen; i++) {
00683                 if (i > last_line_end + MAXLINELEN)
00684                         has_long_lines = 1;
00685 
00686                 if (ubuf[i] == '\033')
00687                         has_escapes = 1;
00688                 if (ubuf[i] == '\b')
00689                         has_backspace = 1;
00690 
00691                 if (ubuf[i] == '\r' && (i + 1 <  ulen && ubuf[i + 1] == '\n')) {
00692                         n_crlf++;
00693                         last_line_end = i;
00694                 }
00695                 if (ubuf[i] == '\r' && (i + 1 >= ulen || ubuf[i + 1] != '\n')) {
00696                         n_cr++;
00697                         last_line_end = i;
00698                 }
00699                 if (ubuf[i] == '\n' && (i - 1 <  0    || ubuf[i - 1] != '\r')) {
00700                         n_lf++;
00701                         last_line_end = i;
00702                 }
00703                 if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
00704                         n_nel++;
00705                         last_line_end = i;
00706                 }
00707         }
00708 
00709         if ((fm->flags & FMAGIC_FLAGS_MIME)) {
00710                 if (subtype_mime != NULL)
00711                         fmagicPrintf(fm, subtype_mime);
00712                 else
00713                         fmagicPrintf(fm, "text/plain");
00714 
00715                 if (code_mime != NULL) {
00716                         fmagicPrintf(fm, "; charset=");
00717                         fmagicPrintf(fm, code_mime);
00718                 }
00719         } else {
00720                 fmagicPrintf(fm, code);
00721 
00722                 if (subtype != NULL) {
00723                         fmagicPrintf(fm, " ");
00724                         fmagicPrintf(fm, subtype);
00725                 }
00726                 fmagicPrintf(fm, " ");
00727                 fmagicPrintf(fm, type);
00728 
00729                 if (has_long_lines)
00730                         fmagicPrintf(fm, ", with very long lines");
00731 
00732                 /*
00733                  * Only report line terminators if we find one other than LF,
00734                  * or if we find none at all.
00735                  */
00736                 if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
00737                     (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
00738                         fmagicPrintf(fm, ", with");
00739 
00740                         if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0)
00741                                 fmagicPrintf(fm, " no");
00742                         else {
00743                                 if (n_crlf) {
00744                                         fmagicPrintf(fm, " CRLF");
00745                                         if (n_cr || n_lf || n_nel)
00746                                                 fmagicPrintf(fm, ",");
00747                                 }
00748                                 if (n_cr) {
00749                                         fmagicPrintf(fm, " CR");
00750                                         if (n_lf || n_nel)
00751                                                 fmagicPrintf(fm, ",");
00752                                 }
00753                                 if (n_lf) {
00754                                         fmagicPrintf(fm, " LF");
00755                                         if (n_nel)
00756                                                 fmagicPrintf(fm, ",");
00757                                 }
00758                                 if (n_nel)
00759                                         fmagicPrintf(fm, " NEL");
00760                         }
00761 
00762                         fmagicPrintf(fm, " line terminators");
00763                 }
00764 
00765                 if (has_escapes)
00766                         fmagicPrintf(fm, ", with escape sequences");
00767                 if (has_backspace)
00768                         fmagicPrintf(fm, ", with overstriking");
00769         }
00770 
00771         return 1;
00772 }
00773 /*@=bounds@*/

Generated on Sun Oct 26 13:01:58 2003 for rpm by doxygen1.2.18