--- src/awffull.c.orig 2008-12-13 11:28:35.000000000 +0900 +++ src/awffull.c 2008-12-31 16:43:45.000000000 +0900 @@ -37,6 +37,9 @@ /* STANDARD INCLUDES */ /*********************************************/ #include "awffull.h" /* main header */ +#ifdef HAVE_ICONV +#include +#endif /* internal function prototypes */ @@ -137,6 +140,10 @@ static char const ab_month_name[][4] = { "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" }; +#ifdef HAVE_ICONV +iconv_t cd_from_sjis, cd_from_eucj; +#endif + /*********************************************/ /* MAIN - start here */ /*********************************************/ @@ -339,6 +346,11 @@ main(int argc, char *argv[]) start_time = times(&mytms); +#ifdef HAVE_ICONV + cd_from_sjis = iconv_open("UTF-8", "Shift_JIS"); + cd_from_eucj = iconv_open("UTF-8", "EUC-JP"); +#endif + /********************************************* * MAIN PROCESS LOOP - read through log file * *********************************************/ @@ -801,9 +813,17 @@ main(int argc, char *argv[]) } del_htabs(); +#ifdef HAVE_ICONV + iconv_close(cd_from_sjis); + iconv_close(cd_from_eucj); +#endif /* Whew, all done! Exit with completion status (0) */ exit(0); } else { +#ifdef HAVE_ICONV + iconv_close(cd_from_sjis); + iconv_close(cd_from_eucj); +#endif /* No valid records found... exit with error (1) */ VPRINT(VERBOSE1, "%s\n", _("No valid records found!")); exit(1); @@ -1740,6 +1760,26 @@ unescape(char *str) if (!str) return NULL; /* make sure strings valid */ + /* for apache log's escape code. */ + while (*cp1) { + if (*cp1 == '\\' && *(cp1 + 1) == 'x' && + isxdigit(*(cp1 + 2)) && isxdigit(*(cp1 + 3))) { + *cp2 = from_hex(*(cp1 + 2)) * 16 + from_hex(*(cp1 + 3)); + if ((*cp2 < 32) || (*cp2 == 127)) + *cp2 = '_'; + cp1 += 4; + cp2++; + } else if (*cp1 == '\\' && *(cp1 + 1) == '\\') { + *cp2 = '\\'; + cp1 += 2; + cp2++; + } else { + *cp2++ = *cp1++; + } + } + *cp2 = *cp1; + cp1 = cp2 = str; + while (*cp1) { if (*cp1 == '%') { /* Found an escape? */ cp1++; @@ -1762,16 +1802,139 @@ unescape(char *str) return str; /* return the string */ } +#ifdef HAVE_ICONV + +/*********************************************/ +/* SCORE_XXX - calculate score */ +/*********************************************/ + +int score_eucj(unsigned char *str) +{ + int stat=0; + int score=0; + int bad=0; + if(str==NULL) return -1; + + for(; *str!=0;str++){ + switch(stat){ + case 0: + if(*str>= 0x20 && *str <= 0x7e) score++; //ASCII + else if(*str >= 0xa1 && *str <= 0xfe) stat=1; //KANJI(1) + else if(*str == 0x8f); // HOJYO KANJI + else if(*str == 0x8e) stat=2; // KANA + else if(*str < 0x20); //CTRL + else bad=1; + break; + case 1: + if(*str >= 0xa1 && *str <= 0xfe) score += 2; //KANJI(2) + else bad=1; + stat=0; + break; + case 2: + if(*str >= 0xa1 && *str <= 0xdf); //hankaku <- 0 + else bad=1; + stat=0; + break; + } + } + if(bad != 0) score = -1; + return score; +} + +int score_sjis(unsigned char *str) +{ + int stat=0; + int score=0; + int bad=0; + if(str==NULL) return -1; + + for(; *str != 0; str++){ + switch(stat){ + case 0: + if(*str>= 0x20 && *str <= 0x7e) score++;//ASCII + else if((*str >= 0x81 && *str <= 0x9f) || + (*str >= 0xe0 && *str <= 0xfc)) stat=1; //SJIS(1) + else if(*str >= 0xa1 && *str <= 0xdf); // KANA + else if(*str < 0x20); // CTRL + else bad=1; + break; + case 1: + if((*str >= 0x40 && *str <= 0x7e) || + (*str >= 0x80 && *str <= 0xfc)) score += 2; //SJIS(2) + else bad=1; + stat=0; + break; + } + } + if(bad != 0) score = -1; + return score; +} + +int score_utf8(unsigned char *str) +{ + int stat=0; + int score=0; + int bad=0; + if(str==NULL) return -1; + + for(; *str != 0; str++){ + switch(stat){ + case 0: + if(*str>= 0x20 && *str <= 0x7e) score++; //ASCII + else if(*str >= 0xc0 && *str <= 0xdf) stat=1; //greek etc. + else if(*str >= 0xe0 && *str <= 0xef) stat=2; //KANJI etc. + else if(*str >= 0xf0 && *str <= 0xf7) stat=4; + else if(*str < 0x20); //CTRL + else bad=1; + break; + case 1: + if(*str >= 0x80 && *str <= 0xbf) score++; + else bad=1; + stat=0; + break; + case 2: + if(*str >= 0x80 && *str <= 0xbf) stat=3; //KANJI(2) + else {bad=1; stat=0;} + break; + case 3: + if(*str >= 0x80 && *str <= 0xbf) score+=3; //KANJI(3) + else bad=1; + stat=0; + break; + case 4: + case 5: + if(*str >= 0x80 && *str <= 0xbf) stat++; + else {bad=1; stat=0;} + break; + case 6: + if(*str >= 0x80 && *str <= 0xbf) score+=4; + else bad=1; + stat=0; + break; + } + } + if(bad != 0) score = -1; + return score; +} + +#endif + /*********************************************/ /* SRCH_STRING - get search strings from ref */ /*********************************************/ void srch_string(char *refer, char *ptr) { - char tmpbuf[BUFSIZE]; - char srch[80] = ""; - char *cp1, *cp2, *cps; + unsigned char tmpbuf[BUFSIZE]; + unsigned char srch[80] = ""; + unsigned char *cp1, *cp2, *cps; int sp_flg = 0; +#ifdef HAVE_ICONV + int sjis, eucj, utf8; + unsigned char tmpbuf2[BUFSIZE]; + unsigned char *cp3; + size_t inlen, outlen; +#endif /* Check if search engine referrer or return */ if ((cps = isinlist(search_list, refer)) == NULL) @@ -1832,6 +1995,35 @@ srch_string(char *refer, char *ptr) else break; +#ifdef HAVE_ICONV + utf8 = score_utf8(cp2); + sjis = score_sjis(cp2); + eucj = score_eucj(cp2); + if (sjis > utf8 && sjis > eucj) { + iconv(cd_from_sjis, NULL, 0, NULL, 0); + cp3 = cp2; + inlen = strlen(cp2) + 1; + cp1 = tmpbuf2; + outlen = sizeof(tmpbuf2); + if (iconv(cd_from_sjis, + (const char **)&cp3, &inlen, (char**)&cp1, &outlen) >= 0 && + inlen == 0) { + cp2 = tmpbuf2; + } + } else if (eucj > utf8 && eucj > sjis) { + iconv(cd_from_eucj, NULL, 0, NULL, 0); + cp3 = cp2; + inlen = strlen(cp2) + 1; + cp1 = tmpbuf2; + outlen = sizeof(tmpbuf2); + if (iconv(cd_from_eucj, + (const char **)&cp3, &inlen, (char**)&cp1, &outlen) >= 0 && + inlen == 0) { + cp2 = tmpbuf2; + } + } +#endif + /* strip invalid chars */ cp1 = cp2; while (*cp1 != '\0') { @@ -2391,6 +2583,7 @@ cleanup_refer(char *refer, char *srchstr /* unescape referrer */ unescape(refer); + unescape(refer); /* XXX */ /* fix referrer field */ cp1 = refer;