00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 #include "system.h"
00038 #include "file.h"
00039 #include "names.h"
00040 #include "tar.h"
00041 #include "debug.h"
00042
00043 FILE_RCSID("@(#)Id: ascmagic.c,v 1.32 2002/07/03 18:26:37 christos Exp ")
00044
00045
00046
00047
00048
00049
00050
/* True when c is one of the octal digit characters '0'..'7'. */
#define isodigit(c) ( ((c) >= '0') && ((c) <= '7') )

/*
 * Parse an octal number out of a fixed-width character field.
 *
 * digs  - width of the field in characters.
 * where - start of the field; need not be NUL-terminated as long as
 *         digs characters are readable.
 *
 * Leading whitespace is skipped, then octal digits are accumulated until
 * the field is exhausted or a non-octal character is met.  The number may
 * be followed by a NUL or by whitespace.
 *
 * Returns the parsed value, or -1 if the field is entirely whitespace or
 * the digits are followed by any other character.
 */
static int
from_oct(int digs, char *where)
{
	const char *cp = where;
	int left = digs;
	int result = 0;

	/* Skip leading blanks; running out of field here means "no number". */
	for (; isspace((unsigned char)*cp); cp++) {
		if (--left <= 0)
			return -1;
	}

	/* Accumulate three bits per octal digit. */
	for (; left > 0 && isodigit(*cp); cp++, left--)
		result = (result << 3) | (*cp - '0');

	/* Anything other than NUL or whitespace after the digits is an error. */
	if (left > 0 && *cp != '\0' && !isspace((unsigned char)*cp))
		return -1;

	return result;
}
00083
00084
00085
00086
00087
00088
00089
00090
00091 static int
00092 is_tar(const fmagic fm)
00093
00094 {
00095 int nb = fm->nb;
00096 union record *header = (union record *)fm->buf;
00097 int i;
00098 int sum, recsum;
00099 char *p;
00100
00101 if (nb < sizeof(*header))
00102 return 0;
00103
00104 recsum = from_oct(8, header->header.chksum);
00105
00106 sum = 0;
00107 p = header->charptr;
00108
00109 for (i = sizeof(union record); --i >= 0;)
00110
00111 {
00112
00113
00114
00115
00116 sum += 0xFF & *p++;
00117 }
00118
00119
00120 for (i = sizeof(header->header.chksum); --i >= 0;)
00121 sum -= 0xFF & header->header.chksum[i];
00122 sum += ' ' * sizeof header->header.chksum;
00123
00124 if (sum != recsum)
00125 return 0;
00126
00127 if (!strcmp(header->header.magic, TARMAGIC))
00128 return 2;
00129
00130 return 1;
00131 }
/* One decoded character, as stored in the ubuf arrays filled by looks_*(). */
typedef unsigned long unichar;

/* A line with more characters than this is reported as "very long". */
#define MAXLINELEN 300
/*
 * True if x is treated as whitespace: space, tab, CR, LF, 0x85 (counted as
 * NEL by fmagicA) or form feed.  The argument is evaluated several times,
 * so it must not have side effects.
 */
#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
		|| (x) == 0x85 || (x) == '\f')
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
/*
 * Classification codes stored in text_chars[]:
 *   F - rejected by every looks_*() test
 *   T - accepted by looks_ascii() (and all the wider tests)
 *   I - additionally accepted by looks_latin1()
 *   X - additionally accepted only by looks_extended()
 */
#define F 0
#define T 1
#define I 2
#define X 3

/* Per-byte classification table, indexed by the byte value (0..255). */
static char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X (0x7f is F) */
	/*            0x85 (NEL)                     */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
00217
00218
00219 static int
00220 looks_ascii(const unsigned char *buf, int nb,
00221 unichar *ubuf, int *ulen)
00222
00223 {
00224 int i;
00225
00226 *ulen = 0;
00227
00228 for (i = 0; i < nb; i++) {
00229 int t = text_chars[buf[i]];
00230
00231 if (t != T)
00232 return 0;
00233
00234 ubuf[(*ulen)++] = buf[i];
00235 }
00236
00237 return 1;
00238 }
00239
00240
00241
00242 static int
00243 looks_latin1(const unsigned char *buf, int nb,
00244 unichar *ubuf, int *ulen)
00245
00246 {
00247 int i;
00248
00249 *ulen = 0;
00250
00251 for (i = 0; i < nb; i++) {
00252 int t = text_chars[buf[i]];
00253
00254 if (t != T && t != I)
00255 return 0;
00256
00257 ubuf[(*ulen)++] = buf[i];
00258 }
00259
00260 return 1;
00261 }
00262
00263
00264
00265 static int
00266 looks_extended(const unsigned char *buf, int nb,
00267 unichar *ubuf, int *ulen)
00268
00269 {
00270 int i;
00271
00272 *ulen = 0;
00273
00274 for (i = 0; i < nb; i++) {
00275 int t = text_chars[buf[i]];
00276
00277 if (t != T && t != I && t != X)
00278 return 0;
00279
00280 ubuf[(*ulen)++] = buf[i];
00281 }
00282
00283 return 1;
00284 }
00285
00286
00287
00288 static int
00289 looks_utf8(const unsigned char *buf, int nb,
00290 unichar *ubuf, int *ulen)
00291
00292 {
00293 int i, n;
00294 unichar c;
00295 int gotone = 0;
00296
00297 *ulen = 0;
00298
00299 for (i = 0; i < nb; i++) {
00300 if ((buf[i] & 0x80) == 0) {
00301
00302
00303
00304
00305
00306 if (text_chars[buf[i]] != T)
00307 return 0;
00308
00309 ubuf[(*ulen)++] = buf[i];
00310 } else if ((buf[i] & 0x40) == 0) {
00311 return 0;
00312 } else {
00313 int following;
00314
00315 if ((buf[i] & 0x20) == 0) {
00316 c = buf[i] & 0x1f;
00317 following = 1;
00318 } else if ((buf[i] & 0x10) == 0) {
00319 c = buf[i] & 0x0f;
00320 following = 2;
00321 } else if ((buf[i] & 0x08) == 0) {
00322 c = buf[i] & 0x07;
00323 following = 3;
00324 } else if ((buf[i] & 0x04) == 0) {
00325 c = buf[i] & 0x03;
00326 following = 4;
00327 } else if ((buf[i] & 0x02) == 0) {
00328 c = buf[i] & 0x01;
00329 following = 5;
00330 } else
00331 return 0;
00332
00333 for (n = 0; n < following; n++) {
00334 i++;
00335 if (i >= nb)
00336 goto done;
00337
00338 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
00339 return 0;
00340
00341 c = (c << 6) + (buf[i] & 0x3f);
00342 }
00343
00344 ubuf[(*ulen)++] = c;
00345 gotone = 1;
00346 }
00347 }
00348 done:
00349 return gotone;
00350 }
00351
00352
00353
00354 static int
00355 looks_unicode(const unsigned char *buf, int nb,
00356 unichar *ubuf, int *ulen)
00357
00358 {
00359 int bigend;
00360 int i;
00361
00362 if (nb < 2)
00363 return 0;
00364
00365 if (buf[0] == 0xff && buf[1] == 0xfe)
00366 bigend = 0;
00367 else if (buf[0] == 0xfe && buf[1] == 0xff)
00368 bigend = 1;
00369 else
00370 return 0;
00371
00372 *ulen = 0;
00373
00374 for (i = 2; i + 1 < nb; i += 2) {
00375
00376
00377 if (bigend)
00378 ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
00379 else
00380 ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
00381
00382 if (ubuf[*ulen - 1] == 0xfffe)
00383 return 0;
00384 if (ubuf[*ulen - 1] < 128 && text_chars[ubuf[*ulen - 1]] != T)
00385 return 0;
00386 }
00387
00388 return 1;
00389 }
00390
00391
/* The one-letter classification codes are not used past this point. */
#undef F
#undef T
#undef I
#undef X
00396
00397
00398
00399
00400
00401
00402
00403
00404
00405
00406
00407
00408
00409
00410
00411
00412
00413
00414
00415
00416
00417
00418
00419
/*
 * Byte translation table used by from_ebcdic(): the element at index b is
 * the output byte produced for input byte b.  Entries written as character
 * constants are the printable characters; the rest are given numerically.
 */
static unsigned char ebcdic_to_ascii[] = {
	  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
	 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
	128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
	144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
	' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
	'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
	'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
	186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
	195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
	202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
	209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
	216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
	'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
	'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
	'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
	'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
};
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
/*
 * Alternative 256-entry byte translation table, indexed by the input byte.
 * NOTE(review): judging by the name this maps EBCDIC code page 1047 to
 * ISO 8859 - confirm.  No code visible in this part of the file refers
 * to it.
 */
static unsigned char ebcdic_1047_to_8859[] = {
	0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
	0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
	0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
	0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
	0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
	0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
	0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
	0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
	0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
	0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
	0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
	0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
	0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
	0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
	0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
	0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
00472
00473
00474
00475
00476
00477 static void
00478 from_ebcdic(const unsigned char *buf, int nb, unsigned char *otp)
00479
00480 {
00481 int i;
00482
00483 for (i = 0; i < nb; i++) {
00484 otp[i] = ebcdic_to_ascii[buf[i]];
00485 }
00486 }
00487
00488
00489
00490 static int
00491 fmagicAMatch(const unsigned char *s, const unichar *us, int ulen)
00492
00493 {
00494 size_t i;
00495
00496 for (i = 0; i < ulen; i++) {
00497 if (s[i] != us[i])
00498 return 0;
00499 }
00500
00501 if (s[i])
00502 return 0;
00503 else
00504 return 1;
00505 }
00506
00507
00508
00509
00510 int
00511 fmagicA(fmagic fm)
00512 {
00513 unsigned char * buf = fm->buf;
00514 int nb = fm->nb;
00515
00516 char nbuf[HOWMANY+1];
00517 unichar ubuf[HOWMANY+1];
00518 int ulen;
00519 struct names *p;
00520 int i;
00521
00522 char *code = NULL;
00523 char *code_mime = NULL;
00524 char *type = NULL;
00525 char *subtype = NULL;
00526 char *subtype_mime = NULL;
00527
00528 int has_escapes = 0;
00529 int has_backspace = 0;
00530
00531 int n_crlf = 0;
00532 int n_lf = 0;
00533 int n_cr = 0;
00534 int n_nel = 0;
00535
00536 int last_line_end = -1;
00537 int has_long_lines = 0;
00538
00539
00540
00541
00542
00543 switch (is_tar(fm)) {
00544 case 1:
00545 fmagicPrintf(fm, ((fm->flags & FMAGIC_FLAGS_MIME)
00546 ? "application/x-tar" : "tar archive"));
00547 return 1;
00548 case 2:
00549 fmagicPrintf(fm, ((fm->flags & FMAGIC_FLAGS_MIME)
00550 ? "application/x-tar, POSIX" : "POSIX tar archive"));
00551 return 1;
00552 }
00553
00554
00555
00556
00557
00558
00559 while (nb > 1 && buf[nb - 1] == '\0')
00560 nb--;
00561
00562
00563
00564
00565
00566
00567
00568 if (looks_ascii(buf, nb, ubuf, &ulen)) {
00569 code = "ASCII";
00570 code_mime = "us-ascii";
00571 type = "text";
00572 } else if (looks_utf8(buf, nb, ubuf, &ulen)) {
00573 code = "UTF-8 Unicode";
00574 code_mime = "utf-8";
00575 type = "text";
00576 } else if ((i = looks_unicode(buf, nb, ubuf, &ulen))) {
00577 if (i == 1)
00578 code = "Little-endian UTF-16 Unicode";
00579 else
00580 code = "Big-endian UTF-16 Unicode";
00581
00582 type = "character data";
00583 code_mime = "utf-16";
00584 } else if (looks_latin1(buf, nb, ubuf, &ulen)) {
00585 code = "ISO-8859";
00586 type = "text";
00587 code_mime = "iso-8859-1";
00588 } else if (looks_extended(buf, nb, ubuf, &ulen)) {
00589 code = "Non-ISO extended-ASCII";
00590 type = "text";
00591 code_mime = "unknown";
00592 } else {
00593 from_ebcdic(buf, nb, nbuf);
00594
00595 if (looks_ascii(nbuf, nb, ubuf, &ulen)) {
00596 code = "EBCDIC";
00597 type = "character data";
00598 code_mime = "ebcdic";
00599 } else if (looks_latin1(nbuf, nb, ubuf, &ulen)) {
00600 code = "International EBCDIC";
00601 type = "character data";
00602 code_mime = "ebcdic";
00603 } else {
00604 return 0;
00605 }
00606 }
00607
00608
00609
00610
00611
00612
00613
00614
00615
00616 if (*ubuf == '.') {
00617 unichar *tp = ubuf + 1;
00618
00619 while (ISSPC(*tp))
00620 ++tp;
00621 if ((tp[0] == '\\' && tp[1] == '\"') ||
00622 (isascii(tp[0]) && isalnum(tp[0]) &&
00623 isascii(tp[1]) && isalnum(tp[1]) &&
00624 ISSPC(tp[2]))) {
00625 subtype_mime = "text/troff";
00626 subtype = "troff or preprocessor input";
00627 goto subtype_identified;
00628 }
00629 }
00630
00631 if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
00632 subtype_mime = "text/fortran";
00633 subtype = "fortran program";
00634 goto subtype_identified;
00635 }
00636
00637
00638
00639 i = 0;
00640 while (i < ulen) {
00641 int end;
00642
00643
00644
00645
00646 while (i < ulen && ISSPC(ubuf[i]))
00647 i++;
00648 if (i >= ulen)
00649 break;
00650
00651
00652
00653
00654 for (end = i + 1; end < nb; end++)
00655 if (ISSPC(ubuf[end]))
00656 break;
00657
00658
00659
00660
00661
00662 for (p = names; p < names + NNAMES; p++)
00663
00664 {
00665 if (p->name == NULL)
00666 break;
00667 if (fmagicAMatch(p->name, ubuf + i, end - i)) {
00668 subtype = types[p->type].human;
00669 subtype_mime = types[p->type].mime;
00670 goto subtype_identified;
00671 }
00672 }
00673
00674 i = end;
00675 }
00676
00677 subtype_identified:
00678
00679
00680
00681
00682 for (i = 0; i < ulen; i++) {
00683 if (i > last_line_end + MAXLINELEN)
00684 has_long_lines = 1;
00685
00686 if (ubuf[i] == '\033')
00687 has_escapes = 1;
00688 if (ubuf[i] == '\b')
00689 has_backspace = 1;
00690
00691 if (ubuf[i] == '\r' && (i + 1 < ulen && ubuf[i + 1] == '\n')) {
00692 n_crlf++;
00693 last_line_end = i;
00694 }
00695 if (ubuf[i] == '\r' && (i + 1 >= ulen || ubuf[i + 1] != '\n')) {
00696 n_cr++;
00697 last_line_end = i;
00698 }
00699 if (ubuf[i] == '\n' && (i - 1 < 0 || ubuf[i - 1] != '\r')) {
00700 n_lf++;
00701 last_line_end = i;
00702 }
00703 if (ubuf[i] == 0x85) {
00704 n_nel++;
00705 last_line_end = i;
00706 }
00707 }
00708
00709 if ((fm->flags & FMAGIC_FLAGS_MIME)) {
00710 if (subtype_mime != NULL)
00711 fmagicPrintf(fm, subtype_mime);
00712 else
00713 fmagicPrintf(fm, "text/plain");
00714
00715 if (code_mime != NULL) {
00716 fmagicPrintf(fm, "; charset=");
00717 fmagicPrintf(fm, code_mime);
00718 }
00719 } else {
00720 fmagicPrintf(fm, code);
00721
00722 if (subtype != NULL) {
00723 fmagicPrintf(fm, " ");
00724 fmagicPrintf(fm, subtype);
00725 }
00726 fmagicPrintf(fm, " ");
00727 fmagicPrintf(fm, type);
00728
00729 if (has_long_lines)
00730 fmagicPrintf(fm, ", with very long lines");
00731
00732
00733
00734
00735
00736 if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
00737 (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
00738 fmagicPrintf(fm, ", with");
00739
00740 if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0)
00741 fmagicPrintf(fm, " no");
00742 else {
00743 if (n_crlf) {
00744 fmagicPrintf(fm, " CRLF");
00745 if (n_cr || n_lf || n_nel)
00746 fmagicPrintf(fm, ",");
00747 }
00748 if (n_cr) {
00749 fmagicPrintf(fm, " CR");
00750 if (n_lf || n_nel)
00751 fmagicPrintf(fm, ",");
00752 }
00753 if (n_lf) {
00754 fmagicPrintf(fm, " LF");
00755 if (n_nel)
00756 fmagicPrintf(fm, ",");
00757 }
00758 if (n_nel)
00759 fmagicPrintf(fm, " NEL");
00760 }
00761
00762 fmagicPrintf(fm, " line terminators");
00763 }
00764
00765 if (has_escapes)
00766 fmagicPrintf(fm, ", with escape sequences");
00767 if (has_backspace)
00768 fmagicPrintf(fm, ", with overstriking");
00769 }
00770
00771 return 1;
00772 }
00773