00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 #include "system.h"
00038 #include "file.h"
00039 #include "names.h"
00040 #include "tar.h"
00041 #include "debug.h"
00042
00043 FILE_RCSID("@(#)Id: ascmagic.c,v 1.32 2002/07/03 18:26:37 christos Exp ")
00044
00045
00046
00047
00048
00049
00050
/* True when c is one of the octal digit characters '0' through '7'. */
#define isodigit(c) ( ((c) >= '0') && ((c) <= '7') )

/*
 * Parse an octal number out of a fixed-width text field.
 *
 * digs  - width of the field, in characters.
 * where - start of the field.
 *
 * Leading whitespace is skipped (and counted against the width), then
 * octal digits are accumulated until the width is used up or a non-digit
 * is met.
 *
 * Returns the parsed value, or -1 if the field consists only of
 * whitespace or if the digits are followed, inside the field, by
 * something other than whitespace or a NUL.
 */
static int
from_oct(int digs, char *where)
{
    const char *cursor = where;
    int remaining = digs;
    int result = 0;

    /* Leading blanks: a field that is blank all the way through is bad. */
    for (; isspace((unsigned char)*cursor); cursor++) {
        if (--remaining <= 0)
            return -1;
    }

    /* Accumulate three bits per octal digit. */
    for (; remaining > 0 && isodigit(*cursor); cursor++, remaining--)
        result = (result << 3) | (*cursor - '0');

    /* Anything but a blank or NUL after the digits is junk. */
    if (remaining > 0 && *cursor != '\0' && !isspace((unsigned char)*cursor))
        return -1;

    return result;
}
00083
00084
00085
00086
00087
00088
00089
00090
00091 static int
00092 is_tar(const fmagic fm)
00093
00094 {
00095 int nb = fm->nb;
00096 union record *header = (union record *)fm->buf;
00097 int i;
00098 int sum, recsum;
00099 char *p;
00100
00101 if (nb < sizeof(*header))
00102 return 0;
00103
00104 recsum = from_oct(8, header->header.chksum);
00105
00106 sum = 0;
00107 p = header->charptr;
00108
00109 for (i = sizeof(union record); --i >= 0;)
00110
00111 {
00112
00113
00114
00115
00116 sum += 0xFF & *p++;
00117 }
00118
00119
00120 for (i = sizeof(header->header.chksum); --i >= 0;)
00121 sum -= 0xFF & header->header.chksum[i];
00122 sum += ' ' * sizeof header->header.chksum;
00123
00124 if (sum != recsum)
00125 return 0;
00126
00127 if (!strcmp(header->header.magic, TARMAGIC))
00128 return 2;
00129
00130 return 1;
00131 }
/* Element type of the decoded-character buffers filled by the looks_*() checks. */
typedef unsigned long unichar;

/* A line longer than this many characters makes fmagicA report "very long lines". */
#define MAXLINELEN 300
/* Whitespace test used when scanning decoded text; 0x85 is counted as "NEL" by fmagicA. */
#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
|| (x) == 0x85 || (x) == '\f')
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
/*
 * Per-byte classification consulted by the looks_*() checks below.
 */
#define F 0   /* rejected by every looks_*() check */
#define T 1   /* accepted by all checks, including looks_ascii() */
#define I 2   /* additionally accepted by looks_latin1() and looks_extended() */
#define X 3   /* additionally accepted only by looks_extended() */


/* Indexed by byte value; each entry is one of F, T, I or X. */
static char text_chars[256] = {
    /* 0x00-0x0f: only 0x07-0x0a, 0x0c and 0x0d (BEL BS HT LF FF CR) are T */
    F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,
    /* 0x10-0x1f: only 0x1b (ESC) is T */
    F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,
    /* 0x20-0x7f: all T except 0x7f (DEL) */
    T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
    T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
    T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
    T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
    T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
    T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,
    /* 0x80-0x9f: X, except 0x85 which is T */
    X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,
    X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,
    /* 0xa0-0xff: all I */
    I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
    I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
    I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
    I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
    I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
    I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I
};
00217
00218
00219 static int
00220 looks_ascii(const unsigned char *buf, int nb,
00221 unichar *ubuf, int *ulen)
00222
00223 {
00224 int i;
00225
00226 *ulen = 0;
00227
00228 for (i = 0; i < nb; i++) {
00229 int t = text_chars[buf[i]];
00230
00231 if (t != T)
00232 return 0;
00233
00234 ubuf[(*ulen)++] = buf[i];
00235 }
00236
00237 return 1;
00238 }
00239
00240
00241
00242 static int
00243 looks_latin1(const unsigned char *buf, int nb,
00244 unichar *ubuf, int *ulen)
00245
00246 {
00247 int i;
00248
00249 *ulen = 0;
00250
00251 for (i = 0; i < nb; i++) {
00252 int t = text_chars[buf[i]];
00253
00254 if (t != T && t != I)
00255 return 0;
00256
00257 ubuf[(*ulen)++] = buf[i];
00258 }
00259
00260 return 1;
00261 }
00262
00263
00264
00265 static int
00266 looks_extended(const unsigned char *buf, int nb,
00267 unichar *ubuf, int *ulen)
00268
00269 {
00270 int i;
00271
00272 *ulen = 0;
00273
00274 for (i = 0; i < nb; i++) {
00275 int t = text_chars[buf[i]];
00276
00277 if (t != T && t != I && t != X)
00278 return 0;
00279
00280 ubuf[(*ulen)++] = buf[i];
00281 }
00282
00283 return 1;
00284 }
00285
00286
00287
00288 static int
00289 looks_utf8(const unsigned char *buf, int nb,
00290 unichar *ubuf, int *ulen)
00291
00292 {
00293 int i, n;
00294 unichar c;
00295 int gotone = 0;
00296
00297 *ulen = 0;
00298
00299 for (i = 0; i < nb; i++) {
00300 if ((buf[i] & 0x80) == 0) {
00301
00302
00303
00304
00305
00306 if (text_chars[buf[i]] != T)
00307 return 0;
00308
00309 ubuf[(*ulen)++] = buf[i];
00310 } else if ((buf[i] & 0x40) == 0) {
00311 return 0;
00312 } else {
00313 int following;
00314
00315 if ((buf[i] & 0x20) == 0) {
00316 c = buf[i] & 0x1f;
00317 following = 1;
00318 } else if ((buf[i] & 0x10) == 0) {
00319 c = buf[i] & 0x0f;
00320 following = 2;
00321 } else if ((buf[i] & 0x08) == 0) {
00322 c = buf[i] & 0x07;
00323 following = 3;
00324 } else if ((buf[i] & 0x04) == 0) {
00325 c = buf[i] & 0x03;
00326 following = 4;
00327 } else if ((buf[i] & 0x02) == 0) {
00328 c = buf[i] & 0x01;
00329 following = 5;
00330 } else
00331 return 0;
00332
00333 for (n = 0; n < following; n++) {
00334 i++;
00335 if (i >= nb)
00336 goto done;
00337
00338 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
00339 return 0;
00340
00341 c = (c << 6) + (buf[i] & 0x3f);
00342 }
00343
00344 ubuf[(*ulen)++] = c;
00345 gotone = 1;
00346 }
00347 }
00348 done:
00349 return gotone;
00350 }
00351
00352
00353
00354 static int
00355 looks_unicode(const unsigned char *buf, int nb,
00356 unichar *ubuf, int *ulen)
00357
00358 {
00359 int bigend;
00360 int i;
00361
00362 if (nb < 2)
00363 return 0;
00364
00365 if (buf[0] == 0xff && buf[1] == 0xfe)
00366 bigend = 0;
00367 else if (buf[0] == 0xfe && buf[1] == 0xff)
00368 bigend = 1;
00369 else
00370 return 0;
00371
00372 *ulen = 0;
00373
00374 for (i = 2; i + 1 < nb; i += 2) {
00375
00376
00377 if (bigend)
00378 ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
00379 else
00380 ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
00381
00382 if (ubuf[*ulen - 1] == 0xfffe)
00383 return 0;
00384 if (ubuf[*ulen - 1] < 128 && text_chars[ubuf[*ulen - 1]] != T)
00385 return 0;
00386 }
00387
00388 return 1;
00389 }
00390
00391
/* The single-letter class names are only needed by the table and checks above. */
#undef F
#undef T
#undef I
#undef X
00396
00397
00398
00399
00400
00401
00402
00403
00404
00405
00406
00407
00408
00409
00410
00411
00412
00413
00414
00415
00416
00417
00418
00419
/*
 * Translation table applied by from_ebcdic(): indexed by an input byte,
 * each entry is the byte written to the output.  For example 0x40 maps
 * to ' ', 0x81-0x89 to 'a'-'i', 0xc1-0xc9 to 'A'-'I' and 0xf0-0xf9 to
 * '0'-'9'.  Entries written as plain numbers are control or 8-bit values.
 */
static unsigned char ebcdic_to_ascii[] = {
    0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31,
    128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7,
    144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26,
    ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
    '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
    '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
    186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
    195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
    202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
    209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
    216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
    '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
    '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
    '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
};
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
#ifdef UNUSED
/*
 * Alternative 256-entry translation table; compiled only when UNUSED is
 * defined and not referenced by the code visible in this file.
 * NOTE(review): judging by the name this maps EBCDIC code page 1047 to
 * ISO 8859 - confirm against the code page definition before using it.
 */
static unsigned char ebcdic_1047_to_8859[] = {
    0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
    0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
    0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
    0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
    0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
    0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
    0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
    0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
    0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
    0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
    0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
    0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
    0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
    0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
    0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
#endif
00474
00475
00476
00477
00478
00479 static void
00480 from_ebcdic(const unsigned char *buf, int nb, unsigned char *otp)
00481
00482 {
00483 int i;
00484
00485 for (i = 0; i < nb; i++) {
00486 otp[i] = ebcdic_to_ascii[buf[i]];
00487 }
00488 }
00489
00490
00491
00492 static int
00493 fmagicAMatch(const unsigned char *s, const unichar *us, int ulen)
00494
00495 {
00496 size_t i;
00497
00498 for (i = 0; i < ulen; i++) {
00499 if (s[i] != us[i])
00500 return 0;
00501 }
00502
00503 if (s[i])
00504 return 0;
00505 else
00506 return 1;
00507 }
00508
00509
00510
00511
00512 int
00513 fmagicA(fmagic fm)
00514 {
00515 unsigned char * buf = fm->buf;
00516 int nb = fm->nb;
00517
00518 char nbuf[HOWMANY+1];
00519 unichar ubuf[HOWMANY+1];
00520 int ulen;
00521 struct names *p;
00522 int i;
00523
00524 const char *code = NULL;
00525 const char *code_mime = NULL;
00526 const char *type = NULL;
00527 const char *subtype = NULL;
00528 const char *subtype_mime = NULL;
00529
00530 int has_escapes = 0;
00531 int has_backspace = 0;
00532
00533 int n_crlf = 0;
00534 int n_lf = 0;
00535 int n_cr = 0;
00536 int n_nel = 0;
00537
00538 int last_line_end = -1;
00539 int has_long_lines = 0;
00540
00541
00542
00543
00544
00545 switch (is_tar(fm)) {
00546 case 1:
00547 file_printf(fm, ((fm->flags & FMAGIC_FLAGS_MIME)
00548 ? "application/x-tar" : "tar archive"));
00549 return 1;
00550 case 2:
00551 file_printf(fm, ((fm->flags & FMAGIC_FLAGS_MIME)
00552 ? "application/x-tar, POSIX" : "POSIX tar archive"));
00553 return 1;
00554 }
00555
00556
00557
00558
00559
00560
00561 while (nb > 1 && buf[nb - 1] == '\0')
00562 nb--;
00563
00564
00565
00566
00567
00568
00569
00570 if (looks_ascii(buf, nb, ubuf, &ulen)) {
00571 code = "ASCII";
00572 code_mime = "us-ascii";
00573 type = "text";
00574 } else if (looks_utf8(buf, nb, ubuf, &ulen)) {
00575 code = "UTF-8 Unicode";
00576 code_mime = "utf-8";
00577 type = "text";
00578 } else if ((i = looks_unicode(buf, nb, ubuf, &ulen))) {
00579 if (i == 1)
00580 code = "Little-endian UTF-16 Unicode";
00581 else
00582 code = "Big-endian UTF-16 Unicode";
00583
00584 type = "character data";
00585 code_mime = "utf-16";
00586 } else if (looks_latin1(buf, nb, ubuf, &ulen)) {
00587 code = "ISO-8859";
00588 type = "text";
00589 code_mime = "iso-8859-1";
00590 } else if (looks_extended(buf, nb, ubuf, &ulen)) {
00591 code = "Non-ISO extended-ASCII";
00592 type = "text";
00593 code_mime = "unknown";
00594 } else {
00595 from_ebcdic(buf, nb, nbuf);
00596
00597 if (looks_ascii(nbuf, nb, ubuf, &ulen)) {
00598 code = "EBCDIC";
00599 type = "character data";
00600 code_mime = "ebcdic";
00601 } else if (looks_latin1(nbuf, nb, ubuf, &ulen)) {
00602 code = "International EBCDIC";
00603 type = "character data";
00604 code_mime = "ebcdic";
00605 } else {
00606 return 0;
00607 }
00608 }
00609
00610
00611
00612
00613
00614
00615
00616
00617
00618 if (*ubuf == '.') {
00619 unichar *tp = ubuf + 1;
00620
00621 while (ISSPC(*tp))
00622 ++tp;
00623 if ((tp[0] == '\\' && tp[1] == '\"') ||
00624 (isascii(tp[0]) && isalnum(tp[0]) &&
00625 isascii(tp[1]) && isalnum(tp[1]) &&
00626 ISSPC(tp[2]))) {
00627 subtype_mime = "text/troff";
00628 subtype = "troff or preprocessor input";
00629 goto subtype_identified;
00630 }
00631 }
00632
00633 if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
00634 subtype_mime = "text/fortran";
00635 subtype = "fortran program";
00636 goto subtype_identified;
00637 }
00638
00639
00640
00641 i = 0;
00642 while (i < ulen) {
00643 int end;
00644
00645
00646
00647
00648 while (i < ulen && ISSPC(ubuf[i]))
00649 i++;
00650 if (i >= ulen)
00651 break;
00652
00653
00654
00655
00656 for (end = i + 1; end < nb; end++)
00657 if (ISSPC(ubuf[end]))
00658 break;
00659
00660
00661
00662
00663
00664 for (p = names; p < names + NNAMES; p++)
00665
00666 {
00667 if (p->name == NULL)
00668 break;
00669 if (fmagicAMatch(p->name, ubuf + i, end - i)) {
00670 subtype = types[p->type].human;
00671 subtype_mime = types[p->type].mime;
00672 goto subtype_identified;
00673 }
00674 }
00675
00676 i = end;
00677 }
00678
00679 subtype_identified:
00680
00681
00682
00683
00684 for (i = 0; i < ulen; i++) {
00685 if (i > last_line_end + MAXLINELEN)
00686 has_long_lines = 1;
00687
00688 if (ubuf[i] == '\033')
00689 has_escapes = 1;
00690 if (ubuf[i] == '\b')
00691 has_backspace = 1;
00692
00693 if (ubuf[i] == '\r' && (i + 1 < ulen && ubuf[i + 1] == '\n')) {
00694 n_crlf++;
00695 last_line_end = i;
00696 }
00697 if (ubuf[i] == '\r' && (i + 1 >= ulen || ubuf[i + 1] != '\n')) {
00698 n_cr++;
00699 last_line_end = i;
00700 }
00701 if (ubuf[i] == '\n' && (i - 1 < 0 || ubuf[i - 1] != '\r')) {
00702 n_lf++;
00703 last_line_end = i;
00704 }
00705 if (ubuf[i] == 0x85) {
00706 n_nel++;
00707 last_line_end = i;
00708 }
00709 }
00710
00711 if ((fm->flags & FMAGIC_FLAGS_MIME)) {
00712 if (subtype_mime != NULL)
00713 file_printf(fm, subtype_mime);
00714 else
00715 file_printf(fm, "text/plain");
00716
00717 if (code_mime != NULL) {
00718 file_printf(fm, "; charset=");
00719 file_printf(fm, code_mime);
00720 }
00721 } else {
00722 file_printf(fm, code);
00723
00724 if (subtype != NULL) {
00725 file_printf(fm, " ");
00726 file_printf(fm, subtype);
00727 }
00728 file_printf(fm, " ");
00729 file_printf(fm, type);
00730
00731 if (has_long_lines)
00732 file_printf(fm, ", with very long lines");
00733
00734
00735
00736
00737
00738 if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
00739 (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
00740 file_printf(fm, ", with");
00741
00742 if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0)
00743 file_printf(fm, " no");
00744 else {
00745 if (n_crlf) {
00746 file_printf(fm, " CRLF");
00747 if (n_cr || n_lf || n_nel)
00748 file_printf(fm, ",");
00749 }
00750 if (n_cr) {
00751 file_printf(fm, " CR");
00752 if (n_lf || n_nel)
00753 file_printf(fm, ",");
00754 }
00755 if (n_lf) {
00756 file_printf(fm, " LF");
00757 if (n_nel)
00758 file_printf(fm, ",");
00759 }
00760 if (n_nel)
00761 file_printf(fm, " NEL");
00762 }
00763
00764 file_printf(fm, " line terminators");
00765 }
00766
00767 if (has_escapes)
00768 file_printf(fm, ", with escape sequences");
00769 if (has_backspace)
00770 file_printf(fm, ", with overstriking");
00771 }
00772
00773 return 1;
00774 }
00775